library(tidyverse)
library(dplyr)
library(readr)
library(readxl)
library(vdemdata)
library(survey)
library(countrycode)
library(rvest)
library(stringr)
library(mice)
library(VIM)
library(visdat)
library(naniar)
library(caret)
library(haven)
library(magrittr)
library(sjlabelled)
library(mice)
library(tidyr)
library(ggplot2)
library(scales)
library(viridis)
library(kableExtra)
library(corrplot)
library(cluster)
library(factoextra)
library(ggrepel)
library(randomForest)
library(DALEX)
library(ranger)
library(lme4)
library(lmerTest)
library(sjstats)
library(sjPlot)
library(modelsummary)
library(broom.mixed)
library(stargazer)
library(car)
library(rtf)
library(MuMIn)
# mapping -----------------------------------------------------------------
# we comment out the country codes as we likely don't need them
survey_country_mapping <- data.frame(
country_name = c("France", "Belgium", "Netherlands", "Germany", "Italy",
"Luxembourg", "Denmark", "Ireland", "United Kingdom", "Greece",
"Spain", "Portugal", "Finland", "Sweden",
"Austria", "Cyprus", "Czech Republic", "Estonia", "Hungary",
"Latvia", "Lithuania", "Malta", "Poland", "Slovakia",
"Slovenia", "Bulgaria", "Romania", "Croatia"),
#survey_country_code = c(1, 2, 3, 4, 5,
# 6, 7, 8, 9, 11,
# 12, 13, 16, 17,
# 18, 19, 20, 21, 22,
# 23, 24, 25, 26, 27,
# 28, 29, 30, 32),
iso2 = c("FR", "BE", "NL", "DE", "IT",
"LU", "DK", "IE", "GB", "GR",
"ES", "PT", "FI", "SE",
"AT", "CY", "CZ", "EE", "HU",
"LV", "LT", "MT", "PL", "SK",
"SI", "BG", "RO", "HR"),
stringsAsFactors = FALSE)
# create a mapping for country name standardization: the Czech Republic is a problem
country_name_mapping <- c("Czechia" = "Czech Republic")
# 1. vdemdata ----------------------------------------------------------------
# install the package from GitHub first:
# devtools::install_github("vdeminstitute/vdemdata")
vdem_data <- vdem
# apply country name standardization before filtering
vdem_data_standardized <- vdem_data %>%
mutate(
# Standardize country names
country_name = ifelse(country_name %in% names(country_name_mapping),
country_name_mapping[country_name],
country_name))
# filter for most recent data (2019 to match survey data)
vdem_2019 <- vdem_data_standardized %>%
filter(country_name %in% survey_country_mapping$country_name, year == 2019) %>%
select(country_name, v2x_libdem, v2x_polyarchy, v2x_gender,
v2x_egaldem, v2x_liberal, v2xcs_ccsi, v2x_freexp) # select relevant variables
saveRDS(vdem_2019, file = "vdem_eu_2019.rds")
write.csv(vdem_2019, "vdem_eu_2019.csv")
# 2. Rainbowmap --------------------------------------------------------------
# https://rainbowmap.ilga-europe.org/
# rainbow_data <- read_csv("data/raw/2024-rainbow-map-data.csv")
# but the problem: that's for 2024, not 2019 or before 2019
# hence, we would need to get the data for 2019 and years before:
# create data frames for each year
df_2019 <- data.frame(
Country = c("Malta", "Belgium", "United Kingdom", "Norway", "France", "Finland",
"Denmark", "Spain", "Portugal", "Sweden", "Netherlands", "Austria",
"Germany", "Croatia", "Greece", "Ireland", "Hungary", "Luxembourg",
"Iceland", "Slovenia", "Montenegro", "Estonia", "Switzerland", "Georgia",
"Bosnia & Herzegovina", "Slovakia", "Albania", "Serbia", "Bulgaria",
"Czechia", "Kosovo", "Andorra", "Cyprus", "Romania", "Ukraine",
"Lithuania", "Italy", "Poland", "Latvia", "Belarus", "Moldova",
"Russia", "North Macedonia", "Liechtenstein", "San Marino", "Armenia",
"Turkey", "Monaco", "Azerbaijan"),
Value = c(74.72, 70.40, 67.62, 63.64, 62.73, 62.20, 60.31, 58.49, 57.50, 52.86,
49.72, 48.54, 47.45, 45.83, 44.82, 44.72, 42.83, 41.34, 40.20, 39.82,
37.16, 34.38, 30.06, 29.49, 29.37, 29.04, 27.74, 26.43, 25.75, 25.50,
25.34, 23.93, 22.03, 20.67, 19.97, 19.91, 19.43, 18.10, 16.83, 15.82,
15.10, 11.75, 10.88, 10.08, 9.87, 8.50, 8.50, 7.31, 5.67),
Year = 2019)
df_2018 <- data.frame(
Country = c("Malta", "Belgium", "United Kingdom", "Finland", "France", "Norway",
"Portugal", "Denmark", "Spain", "Sweden", "Netherlands", "Germany",
"Austria", "Greece", "Ireland", "Croatia", "Slovenia", "Luxembourg",
"Iceland", "Hungary", "Estonia", "Switzerland", "Montenegro", "Andorra",
"Albania", "Kosovo", "Bosnia & Herzegovina", "Serbia", "Czechia",
"Cyprus", "Slovakia", "Italy", "Georgia", "Bulgaria", "Lithuania",
"Romania", "Ukraine", "Poland", "Liechtenstein", "Latvia",
"North Macedonia", "Belarus", "Moldova", "San Marino", "Russia",
"Monaco", "Turkey", "Armenia", "Azerbaijan"),
Value = c(91.94, 78.70, 73.48, 73.27, 72.81, 72.74, 69.16, 67.69, 67.03, 60.10,
59.64, 59.00, 56.40, 52.32, 52.22, 50.58, 47.73, 47.48, 47.22, 47.16,
39.34, 38.44, 37.74, 34.81, 33.24, 32.98, 31.38, 29.68, 29.20, 28.95,
28.65, 28.82, 25.87, 24.15, 21.78, 21.12, 20.95, 18.23, 17.87, 16.07,
14.03, 13.35, 13.08, 12.32, 10.80, 9.78, 8.60, 7.20, 4.70),
Year = 2018)
df_2017 <- data.frame(
Country = c("Malta", "Norway", "United Kingdom", "Belgium", "France", "Portugal",
"Finland", "Denmark", "Spain", "Netherlands", "Croatia", "Sweden",
"Austria", "Germany", "Ireland", "Iceland", "Greece", "Luxembourg",
"Hungary", "Slovenia", "Montenegro", "Andorra", "Estonia", "Albania",
"Bosnia & Herzegovina", "Switzerland", "Kosovo", "Serbia", "Czechia",
"Cyprus", "Slovakia", "Italy", "Georgia", "Bulgaria", "Romania",
"Ukraine", "Poland", "Liechtenstein", "Lithuania", "Latvia",
"North Macedonia", "Belarus", "Moldova", "San Marino", "Monaco",
"Turkey", "Armenia", "Azerbaijan"),
Value = c(77.74, 75.73, 71.86, 70.82, 69.16, 68.27, 67.69, 67.03, 64.44, 62.36,
60.10, 55.58, 54.41, 52.22, 47.22, 46.92, 46.48, 44.82, 44.28, 38.64,
34.81, 33.31, 33.24, 31.34, 30.94, 30.48, 29.68, 29.20, 28.95, 27.60,
26.67, 25.87, 23.15, 21.12, 19.00, 18.23, 17.87, 17.28, 17.12, 16.03,
13.35, 13.08, 12.32, 9.78, 8.60, 7.20, 6.40, 4.70),
Year = 2017)
df_2016 <- data.frame(
Country = c("Malta", "Belgium", "United Kingdom", "Spain", "Denmark", "Portugal",
"Finland", "France", "Croatia", "Netherlands", "Norway", "Sweden",
"Austria", "Iceland", "Greece", "Germany", "Ireland", "Hungary",
"Luxembourg", "Montenegro", "Estonia", "Albania", "Switzerland",
"Andorra", "Serbia", "Cyprus", "Slovenia", "Czechia", "Kosovo",
"Georgia", "Bosnia & Herzegovina", "Slovakia", "Bulgaria", "Romania",
"Italy", "Poland", "Liechtenstein", "Lithuania", "Latvia",
"North Macedonia", "San Marino", "Moldova", "Belarus", "Monaco",
"Turkey", "Armenia", "Azerbaijan"),
Value = c(77.75, 81.85, 79.19, 70.95, 70.90, 69.55, 67.25, 66.60, 66.55, 66.10,
65.15, 64.85, 62.21, 59.00, 58.30, 55.14, 54.70, 51.40, 50.35, 45.20,
38.25, 34.40, 33.15, 32.10, 32.00, 31.95, 31.65, 31.60, 31.55, 30.35,
29.40, 29.20, 24.00, 23.45, 19.75, 18.30, 18.20, 18.10, 17.35, 15.55,
14.40, 14.15, 13.35, 10.80, 8.75, 7.20, 4.85),
Year = 2016)
df_2015 <- data.frame(
Country = c("United Kingdom", "Belgium", "Malta", "Sweden", "Croatia", "Netherlands",
"Norway", "Spain", "Denmark", "Portugal", "France", "Iceland", "Finland",
"Germany", "Austria", "Hungary", "Montenegro", "Luxembourg", "Albania",
"Ireland", "Greece", "Georgia", "Czechia", "Estonia", "Slovenia",
"Andorra", "Bosnia & Herzegovina", "Serbia", "Slovakia", "Romania",
"Switzerland", "Bulgaria", "Poland", "Italy", "Liechtenstein",
"Lithuania", "Cyprus", "Kosovo", "Latvia", "Moldova", "Belarus",
"San Marino", "North Macedonia", "Turkey", "Monaco", "Armenia",
"Azerbaijan"),
Value = c(88.00, 83.00, 77.00, 72.00, 71.00, 69.00, 69.00, 69.00, 68.00, 67.00,
65.00, 63.00, 62.00, 56.00, 52.00, 50.00, 46.00, 43.00, 42.00, 40.00,
39.00, 36.00, 35.00, 34.00, 32.00, 31.00, 29.00, 29.00, 29.00, 28.00,
28.00, 27.00, 26.00, 22.00, 19.00, 19.00, 18.00, 18.00, 18.00, 16.00,
14.00, 14.00, 13.00, 12.00, 11.00, 9.00, 5.00),
Year = 2015)
df_2014 <- data.frame(
Country = c("United Kingdom", "Belgium", "Spain", "Netherlands", "Norway",
"Portugal", "Sweden", "France", "Iceland", "Denmark", "Malta",
"Croatia", "Germany", "Hungary", "Austria", "Montenegro", "Finland",
"Albania", "Slovenia", "Czechia", "Estonia", "Ireland", "Greece",
"Slovakia", "Serbia", "Bulgaria", "Switzerland", "Luxembourg",
"Romania", "Poland", "Italy", "Georgia", "Lithuania", "Andorra",
"Bosnia & Herzegovina", "Cyprus", "Latvia", "Liechtenstein", "Kosovo",
"Moldova", "Turkey", "San Marino", "Belarus", "North Macedonia",
"Ukraine", "Monaco", "Armenia", "Azerbaijan"),
Value = c(80.25, 78.10, 73.26, 69.90, 68.40, 66.60, 65.30, 64.10, 63.95, 59.90,
56.80, 56.30, 55.68, 53.65, 52.10, 47.05, 45.30, 38.40, 35.00, 34.65,
34.65, 33.65, 31.15, 30.50, 30.30, 30.00, 28.85, 28.35, 27.95, 27.65,
27.40, 28.05, 21.70, 20.60, 20.10, 19.65, 19.65, 18.00, 17.10, 16.50,
14.15, 13.70, 13.60, 13.30, 11.65, 10.10, 8.50, 6.60),
Year = 2014)
df_2013 <- data.frame(
Country = c("United Kingdom", "Belgium", "Norway", "Sweden", "Spain", "Portugal",
"France", "Netherlands", "Denmark", "Iceland", "Hungary", "Germany",
"Croatia", "Finland", "Austria", "Albania", "Malta", "Slovenia",
"Czechia", "Ireland", "Romania", "Estonia", "Switzerland", "Luxembourg",
"Greece", "Slovakia", "Montenegro", "Serbia", "Poland", "Georgia",
"Lithuania", "Andorra", "Bosnia & Herzegovina", "Cyprus", "Latvia",
"Italy", "Bulgaria", "Liechtenstein", "Turkey", "San Marino", "Belarus",
"Kosovo", "North Macedonia", "Ukraine", "Monaco", "Armenia", "Azerbaijan"),
Value = c(78.50, 68.73, 65.65, 65.30, 65.04, 64.60, 64.10, 60.00, 59.80, 55.50,
54.70, 54.29, 48.30, 47.25, 43.35, 38.40, 35.30, 35.00, 34.65, 33.65,
31.30, 28.90, 28.85, 28.35, 28.10, 28.90, 28.65, 25.05, 21.65, 21.05,
20.70, 20.60, 19.95, 19.65, 19.65, 19.40, 18.00, 15.50, 14.15, 13.70,
13.60, 13.50, 13.30, 11.65, 10.10, 7.50, 7.10),
Year = 2013)
# combine all data frames into one
df_combined <- bind_rows(df_2019, df_2018, df_2017, df_2016, df_2015, df_2014, df_2013)
# create new, compressed df
# step 1: Filter data for 2019 and 2018
df_2019 <- df_combined %>% filter(Year == 2019) %>% rename(Value_2019 = Value)
df_2018 <- df_combined %>% filter(Year == 2018) %>% rename(Value_2018 = Value)
# step 2: Filter data for 2013 and 2014 and calculate the average
df_2013_2014 <- df_combined %>% filter(Year %in% c(2013, 2014)) %>%
group_by(Country) %>%
summarise(Avg_2013_2014 = mean(Value, na.rm = TRUE))
# step 3: Join the data frames for 2019 and 2018
df_compressed <- df_2019 %>%
left_join(df_2018, by = "Country") %>%
select(Country, Value_2019, Value_2018)
# step 4: Calculate the average for 2019 and 2018
df_compressed <- df_compressed %>%
mutate(Avg_2019_2018 = (Value_2019 + Value_2018) / 2)
# step 5: Join the average for 2013 and 2014
df_compressed <- df_compressed %>%
left_join(df_2013_2014, by = "Country")
# step 6: Calculate the difference between the averages
df_compressed <- df_compressed %>%
mutate(Difference = Avg_2019_2018 - Avg_2013_2014)
# step 7: Select and reorder columns for the final compressed data frame
df_compressed <- df_compressed %>%
select(Country, Value_2019, Value_2018, Avg_2019_2018,
Avg_2013_2014, Difference)
rainbow_df <- df_compressed
# save the data frame
saveRDS(rainbow_df, file = "rainbow_df.rds")
write.csv(rainbow_df, "rainbow_df.csv")
# 3. The Economist: Democracy scores 2018 ---------------------------------
# https://enperspectiva.uy/wp-content/uploads/2019/01/Democracy_Index_2018.pdf
democracy_scores <- data.frame(
Country = c("Belgium", "Denmark", "Greece", "Spain", "Finland", "France", "Ireland", "Italy", "Luxembourg", "Netherlands", "Austria", "Portugal", "Sweden", "Germany", "United Kingdom", "Bulgaria", "Cyprus", "Czech Republic", "Estonia", "Hungary", "Latvia", "Lithuania", "Malta", "Poland", "Romania", "Slovakia", "Slovenia", "Croatia"),
ISO2 = c("BE", "DK", "GR", "ES", "FI", "FR", "IE", "IT", "LU", "NL", "AT", "PT", "SE", "DE", "GB", "BG", "CY", "CZ", "EE", "HU", "LV", "LT", "MT", "PL", "RO", "SK", "SI", "HR"),
Overall_score = c(7.78, 9.22, 7.29, 8.08, 9.14, 7.80, 9.15, 7.71, 8.81, 8.89, 8.29, 7.84, 9.39, 8.68, 8.53, 7.03, 7.59, 7.69, 7.97, 6.63, 7.38, 7.50, 8.21, 6.67, 6.38, 7.10, 7.50, 6.57),
#Global_rank = c(31, 5, 39, 19, 8, 29, "6=", 33, 12, 11, 16, 27, 3, 13, 14, 46, 35, 34, "23=", "57=", 38, "36=", 18, "54=", "66=", 44, "36=", 60),
#Regional_rank = c(17, 4, 20, 14, 6, 16, 5, 18, 9, 8, 12, 15, 3, 10, 11, 7, 19, 2, 1, 9, 5, "3=", 13, 8, 12, 6, "3=", 10),
Electoral_process_and_pluralism = c(9.58, 10.00, 9.58, 9.17, 10.00, 9.58, 9.58, 9.58, 10.00, 9.58, 9.58, 9.58, 9.58, 9.58, 9.58, 9.17, 9.17, 9.58, 9.58, 8.75, 9.58, 9.58, 9.17, 9.17, 9.17, 9.58, 9.58, 9.17),
Functioning_of_government = c(8.93, 9.29, 5.36, 7.14, 8.93, 7.50, 7.86, 6.07, 8.93, 9.29, 7.86, 7.50, 9.64, 8.57, 7.50, 6.43, 6.43, 6.79, 8.21, 6.07, 6.07, 6.43, 8.21, 6.07, 5.71, 6.79, 6.79, 6.07),
Political_participation = c(5.00, 8.33, 6.11, 7.78, 8.33, 7.78, 8.33, 7.78, 6.67, 8.33, 8.33, 6.11, 8.33, 8.33, 8.33, 7.22, 6.67, 6.67, 6.67, 5.00, 5.56, 6.11, 6.11, 6.11, 5.00, 5.56, 6.67, 5.56),
Political_culture = c(6.88, 9.38, 6.88, 7.50, 8.75, 5.63, 10.00, 6.88, 8.75, 8.13, 6.88, 6.88, 10.00, 7.50, 8.13, 4.38, 6.88, 6.88, 6.88, 6.25, 6.88, 6.25, 8.75, 4.38, 4.38, 5.63, 6.25, 5.00),
Civil_liberties = c(8.53, 9.12, 8.53, 8.82, 9.71, 8.53, 10.00, 8.24, 9.71, 9.12, 8.82, 9.12, 9.41, 9.41, 9.12, 7.94, 8.82, 8.53, 8.53, 7.06, 8.82, 9.12, 8.82, 7.65, 7.65, 7.94, 8.24, 7.06),
Regime_type = c("Flawed democracy", "Full democracy", "Flawed democracy", "Full democracy", "Full democracy", "Flawed democracy", "Full democracy", "Flawed democracy", "Full democracy", "Full democracy", "Full democracy", "Flawed democracy", "Full democracy", "Full democracy", "Full democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Full democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy"))
saveRDS(democracy_scores, file = "democracy_scores.rds")
write.csv(democracy_scores, "democracy_scores.csv")
# 4. Happiness data 2018 ---------------------------------------------------------------------
# https://s3.amazonaws.com/happiness-report/2018/WHR_web.pdf
happiness_scores <- data.frame(
Country = c("Finland", "Denmark", "Greece", "Spain", "France", "Ireland", "Italy", "Luxembourg", "Netherlands", "Austria", "Portugal", "Sweden", "Germany", "United Kingdom", "Bulgaria", "Cyprus", "Czech Republic", "Estonia", "Hungary", "Latvia", "Lithuania", "Malta", "Poland", "Romania", "Slovakia", "Slovenia", "Croatia"),
ISO2 = c("FI", "DK", "GR", "ES", "FR", "IE", "IT", "LU", "NL", "AT", "PT", "SE", "DE", "GB", "BG", "CY", "CZ", "EE", "HU", "LV", "LT", "MT", "PL", "RO", "SK", "SI", "HR"),
Happiness_Score = c(7.632, 7.555, 5.358, 6.310, 6.489, 6.977, 6.000, 6.910, 7.441, 7.139, 5.410, 7.314, 6.965, 6.814, 4.933, 5.762, 6.711, 5.739, 5.620, 5.933, 5.952, 6.627, 6.123, 5.945, 6.173, 5.948, 5.321))
saveRDS(happiness_scores, file = "happiness_scores.rds")
write.csv(happiness_scores, "happiness_scores.csv")
# 5. GDP per capita ---------------------------------------------------------------------
df_GDP <- read_csv("data/raw/data_20250228194704.csv")
## New names:
## Rows: 1064 Columns: 10
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): IndicatorName, VariableName, MeasurementName, CountryCode, Alpha3Co... dbl
## (2): IndicatorCode, PeriodCode lgl (1): ...10
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...10`
# apply the mapping to the CountryName column
df_GDP <- df_GDP %>%
mutate(
# replace Czechia with Czech Republic
CountryName = ifelse(CountryName %in% names(country_name_mapping),
country_name_mapping[CountryName],
CountryName)) %>%
select("CountryName", "PeriodCode", "Value") %>%
# now filter after standardizing the names
filter(CountryName %in% survey_country_mapping$country_name)
df_GDP <- df_GDP %>%
mutate(Value = as.numeric(as.character(Value)))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Value = as.numeric(as.character(Value))`.
## Caused by warning:
## ! NAs introduced by coercion
df_GDP <- df_GDP %>%
# group by country
group_by(CountryName) %>%
# find first and last year values
summarize(
gdp_2005 = Value[PeriodCode == 2005],
gdp_2018 = Value[PeriodCode == 2018],
# calculate relative growth
gdp_growth = (gdp_2018 - gdp_2005) / gdp_2005 * 100) %>%
# add ISO2 codes for easier joining with other datasets
left_join(survey_country_mapping, by = c("CountryName" = "country_name")) %>%
# select relevant columns
select(CountryName, iso2, gdp_2005, gdp_2018, gdp_growth)
saveRDS(df_GDP, file = "df_GDP.rds")
write.csv(df_GDP, "df_GDP.csv")
# 7. Unemployment rate ----------------------------------------------------
# https://en.wikipedia.org/wiki/List_of_European_Union_member_states_by_unemployment_rate
# scrape data from Wikipedia
url <- "https://en.wikipedia.org/wiki/List_of_European_Union_member_states_by_unemployment_rate"
page <- read_html(url)
tables <- html_table(page, fill = TRUE)
eu_unemployment_table <- tables[[1]]
# clean column names - simplify them to more standard names
colnames(eu_unemployment_table) <- c("Country", "Unemployment", "Employment", "Year")
# clean up the country names by removing footnote references
eu_unemployment_table$Country <- gsub("\\[.*?\\]", "", eu_unemployment_table$Country)
# make sure numeric columns are properly formatted
eu_unemployment_table$Unemployment <- as.character(eu_unemployment_table$Unemployment)
eu_unemployment_table$Employment <- as.numeric(as.character(eu_unemployment_table$Employment))
eu_unemployment_table$Year <- as.numeric(as.character(eu_unemployment_table$Year))
eu_unemployment_table <- eu_unemployment_table %>%
mutate(
# trim spaces from country names
Country = trimws(Country),
# convert Unemployment to numeric (remove any % signs or spaces if present)
Unemployment = as.numeric(gsub("[^0-9.]", "", Unemployment)))
# modify Greece's unemployment rate to the 2018 value
eu_unemployment_table$Unemployment[eu_unemployment_table$Country == "Greece"] <- 19.18
eu_unemployment_table$Year[eu_unemployment_table$Country == "Greece"] <- 2018
# add the UK data manually
uk_data <- data.frame(
Country = "United Kingdom",
Unemployment = 4.12, # from Statista
Employment = 75.25, # estimated from https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/employmentandemployeetypes/articles/singlemonthlabourforcesurveyestimates/december2018
Year = 2018)
# append UK data to the table
eu_unemployment_table <- rbind(eu_unemployment_table, uk_data)
# ignore the year column
eu_unemployment_table <- eu_unemployment_table %>%
select(-Year)
# save
saveRDS(eu_unemployment_table, file = "eu_unemployment_table.rds")
write.csv(eu_unemployment_table, "eu_unemployment_table.csv", row.names = FALSE)
# 8. Gender equality ------------------------------------------------------
# https://eige.europa.eu/gender-statistics/dgs/indicator/index__index_scores/datatable?time=2017&col=domain&row=geo
gender_eq_index <- read_xlsx("data/raw/index__index_scores.xlsx", range = "A16:V44")
gender_eq_index <- gender_eq_index %>%
select(country_name = "Geographic region\\(Sub-) Domain Scores",
gender_equality_index = "Overall Gender Equality Index") %>%
mutate(country_name = ifelse(country_name == "Czechia", "Czech Republic", country_name))
# Merge the data ----------------------------------------------------------
# create a base dataframe with country identifying variables
country_level_df <- survey_country_mapping
# merge V-Dem data
country_level_df <- country_level_df %>%
left_join(vdem_2019, by = c("country_name" = "country_name"))
# fix country name in democracy_scores for Czech Republic if needed
if(any(democracy_scores$Country == "Czechia")) {
democracy_scores$Country[democracy_scores$Country == "Czechia"] <- "Czech Republic"
}
# merge democracy scores
country_level_df <- country_level_df %>%
left_join(democracy_scores, by = c("iso2" = "ISO2"))
# merge GDP data
country_level_df <- country_level_df %>%
left_join(df_GDP, by = "iso2")
# rename some countries to match our country_name format
rainbow_country_mapping <- data.frame(
original = c("Czechia", "Andorra", "Bosnia & Herzegovina", "North Macedonia", "United Kingdom"),
standardized = c("Czech Republic", "Andorra", "Bosnia and Herzegovina", "Macedonia", "United Kingdom"),
stringsAsFactors = FALSE)
# apply standardized country names
for(i in 1:nrow(rainbow_country_mapping)) {
rainbow_df$Country[rainbow_df$Country == rainbow_country_mapping$original[i]] <-
rainbow_country_mapping$standardized[i]
}
# extract and rename rainbow map variables for clarity
rainbow_data_clean <- rainbow_df %>%
select(
Country,
rainbow_score_2019 = Value_2019,
rainbow_score_2018 = Value_2018,
rainbow_score_avg_2019_2018 = Avg_2019_2018,
rainbow_score_avg_2013_2014 = Avg_2013_2014,
rainbow_score_difference = Difference)
# merge Rainbow Map data
country_level_df <- country_level_df %>%
left_join(rainbow_data_clean, by = c("country_name" = "Country"))
# merge happiness data
country_level_df <- country_level_df %>%
left_join(happiness_scores, by = c("iso2" = "ISO2"))
# merge unemployment data
country_level_df <- country_level_df %>%
left_join(eu_unemployment_table, by = c("country_name" = "Country"))
# create a mapping between ESS country codes and ISO2
ess_country_mapping <- data.frame(
cntry = c("AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI", "FR",
"DE", "HU", "IE", "IT", "LV", "LT", "NL", "PL", "PT", "RO",
"SK", "SI", "ES", "SE", "GB"),
iso2 = c("AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI", "FR",
"DE", "HU", "IE", "IT", "LV", "LT", "NL", "PL", "PT", "RO",
"SK", "SI", "ES", "SE", "GB"),
stringsAsFactors = FALSE)
# first ensure cntry codes match our iso2 codes
country_data_final <- country_data_final %>%
left_join(ess_country_mapping, by = "cntry") %>%
select(-iso2c, -iso3c) # remove original ISO codes to avoid confusion
# rename variables for clarity
ess_data_clean <- country_data_final %>%
select(
cntry,
n_respondents,
n_valid,
lgbt_support_percent = pct_lgbt_support,
mean_religiosity,
mean_left_right,
mean_equal_values,
mean_country_attach,
mean_eduyrs,
mean_age,
pct_young,
pct_high_educ,
se_lgbt_support,
pct_missing_lgbt,
age_lgbt_corr,
relig_lgbt_corr,
lgbt_support_inequality,
educ_gradient,
#iso3c
)
# merge ESS data
country_level_df <- country_level_df %>%
left_join(ess_data_clean, by = c("iso2" = "cntry"))
country_level_df %>% View()
# delete the first Greece row
greece_rows <- which(country_level_df$country_name == "Greece")
if (length(greece_rows) > 1) {
# remove the first instance of Greece
country_level_df <- country_level_df[-greece_rows[1], ]
# verify the fix worked
greece_check <- country_level_df %>%
filter(country_name == "Greece")
print("Greece entries after removing the first instance:")
print(greece_check)
}
# add the gender equality index
country_level_df <- country_level_df %>%
left_join(gender_eq_index, by = "country_name")
We’ll apply different scaling methods based on the type of variable:
# custom function to standardize (z-score)
standardize <- function(x) {
(x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}
# custom function to min-max normalize
normalize <- function(x) {
(x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}
# z-scores for the democracy scores
country_level_df <- country_level_df %>%
mutate(
z_v2x_libdem = standardize(v2x_libdem),
z_v2x_polyarchy = standardize(v2x_polyarchy),
z_v2x_gender = standardize(v2x_gender),
z_v2x_egaldem = standardize(v2x_egaldem),
z_v2x_liberal = standardize(v2x_liberal),
z_v2xcs_ccsi = standardize(v2xcs_ccsi),
z_v2x_freexp = standardize(v2x_freexp))
# z-scores for GDP and unemployment rate, natural log for GDP
country_level_df <- country_level_df %>%
mutate(
z_gdp_2018 = standardize(gdp_2018),
log_gdp_2018 = log(gdp_2018),
z_gdp_growth = standardize(gdp_growth),
z_unemployment = standardize(Unemployment))
# z-scores and normalised rainbow variables
country_level_df <- country_level_df %>%
mutate(
z_rainbow_score = standardize(rainbow_score_2019),
norm_rainbow_score = normalize(rainbow_score_2019),
z_lgbt_support = standardize(lgbt_support_percent),
norm_lgbt_support = normalize(lgbt_support_percent))
# z-socres and normalised for happiness scores and gender equality
country_level_df <- country_level_df %>%
mutate(
z_happiness = standardize(Happiness_Score),
z_gender_equality = standardize(gender_equality_index),
norm_gender_equality = normalize(gender_equality_index))
# religion, politics, education and age
country_level_df <- country_level_df %>%
mutate(
z_religiosity = standardize(mean_religiosity),
z_functioning_of_government = standardize(Functioning_of_government),
z_left_right = standardize(mean_left_right),
z_equal_values = standardize(mean_equal_values),
z_country_attach = standardize(mean_country_attach),
z_eduyrs = standardize(mean_eduyrs),
z_age = standardize(mean_age))
# create two composite scores and regional classification
country_level_df <- country_level_df %>%
mutate(
composite_equality = rowMeans(
cbind(z_gender_equality, z_rainbow_score, z_v2x_gender),
na.rm = TRUE))
country_level_df <- country_level_df %>%
mutate(
composite_democracy = rowMeans(
cbind(z_v2x_libdem, z_v2x_polyarchy, z_v2x_libdem, z_v2x_freexp),
na.rm = TRUE))
country_level_df <- country_level_df %>%
mutate(
region = case_when(
country_name %in% c("Denmark", "Finland", "Sweden", "Estonia", "Latvia", "Lithuania") ~
"Northern Europe",
country_name %in% c("Belgium", "Netherlands", "Luxembourg", "Germany", "France",
"Austria", "United Kingdom", "Ireland") ~
"Western Europe",
country_name %in% c("Portugal", "Spain", "Italy", "Malta", "Greece", "Cyprus") ~
"Southern Europe",
country_name %in% c("Poland", "Czech Republic", "Slovakia", "Hungary", "Slovenia",
"Croatia", "Romania", "Bulgaria") ~ "Eastern Europe",
TRUE ~ NA_character_))
# delete the "Country.x", "Country.y" and "CountryName" columns
country_level_df <- country_level_df %>%
select(-c("Country.x", "Country.y", "CountryName"))
# save as RDS and csv
saveRDS(country_level_df, file = "country_level_df.rds")
write.csv(country_level_df, "country_level_df.csv")
data <- read_dta("data/raw/ZA7575.dta")
# create dataset with non-response indicator
non_numeric_vars <- c("qc19", "studyno1", "studyno2", "doi", "version",
"edition", "survey", "caseid", "uniqid", "serialid",
"tnscntry", "country")
# use the data that is already correctly coded
data_correctly_coded_nonresp_analysis <- data %>%
mutate(
# hard cases for me
d72_1 = ifelse(d72_1 %in% c(5,6), NA, d72_1),
d72_2 = ifelse(d72_2 %in% c(5,6), NA, d72_2),
d60 = ifelse(d60 == 7, NA, d60),
d25 = ifelse(d25 == 8, NA, d25),
d8 = ifelse(d8 > 70, NA, d8), # subjective decision that no one can have more than 70 years of ed (even full-time professors)
d7 = ifelse(d7 %in% c(15,97), NA, d7),
d1 = ifelse(d1 %in% c(97,98), NA, d1),
qa16_1 = ifelse(qa16_1 == 5, NA, qa16_1),
qa16_2 = ifelse(qa16_2 == 5, NA, qa16_2),
qa16_3 = ifelse(qa16_3 == 5, NA, qa16_3),
qa16_4 = ifelse(qa16_4 == 5, NA, qa16_4),
d71_1 = ifelse(d71_1 == 4, NA, d71_1),
d71_2 = ifelse(d71_2 == 4, NA, d71_2),
d71_3 = ifelse(d71_3 == 4, NA, d71_3),
#qb4_1 = ifelse(qb4_1 == 5, NA, qb4_1),
#qb4_2 = ifelse(qb4_2 == 5, NA, qb4_2),
#qb4_3 = ifelse(qb4_3 == 5, NA, qb4_3),
#qb4_4 = ifelse(qb4_4 == 5, NA, qb4_4),
qb3_1 = ifelse(qb3_1 == 5, NA, qb3_1),
qb3_2 = ifelse(qb3_2 == 5, NA, qb3_2),
qb3_3 = ifelse(qb3_3 == 5, NA, qb3_3),
qb3_4 = ifelse(qb3_4 == 5, NA, qb3_4),
qb3_5 = ifelse(qb3_5 == 5, NA, qb3_5),
qb3_6 = ifelse(qb3_6 == 5, NA, qb3_6),
qb3_7 = ifelse(qb3_7 == 5, NA, qb3_7),
qb4_1 = ifelse(qb4_1 == 5, NA, qb4_1),
qb4_2 = ifelse(qb4_2 == 5, NA, qb4_2),
qb4_3 = ifelse(qb4_3 == 5, NA, qb4_3),
qb4_4 = ifelse(qb4_4 == 5, NA, qb4_4),
qb4_5 = ifelse(qb4_5 == 5, NA, qb4_5),
qb5_1 = ifelse(qb5_1 == 5, NA, qb5_1),
qb5_2 = ifelse(qb5_2 == 5, NA, qb5_2),
qb5_3 = ifelse(qb5_3 == 5, NA, qb5_3),
qb5_4 = ifelse(qb5_4 == 5, NA, qb5_4),
sd1_1 = ifelse(sd1_1 %in% c(3,4), NA, sd1_1),
sd1_2 = ifelse(sd1_2 %in% c(3,4), NA, sd1_2),
sd1_3 = ifelse(sd1_3 %in% c(3,4), NA, sd1_3),
sd1_4 = ifelse(sd1_4 %in% c(3,4), NA, sd1_4),
sd1_5 = ifelse(sd1_5 %in% c(3,4), NA, sd1_5),
sd1_6 = ifelse(sd1_6 %in% c(3,4), NA, sd1_6),
sd1_7 = ifelse(sd1_7 %in% c(3,4), NA, sd1_7),
sd1_8 = ifelse(sd1_8 %in% c(3,4), NA, sd1_8),
qc1_1 = ifelse(qc1_1 == 6, NA, qc1_1),
qc1_2 = ifelse(qc1_2 == 6, NA, qc1_2),
qc1_3 = ifelse(qc1_3 == 6, NA, qc1_3),
qc1_4 = ifelse(qc1_4 == 6, NA, qc1_4),
qc1_5 = ifelse(qc1_5 == 6, NA, qc1_5),
qc1_6 = ifelse(qc1_6 == 6, NA, qc1_6),
qc1_7 = ifelse(qc1_7 == 6, NA, qc1_7),
qc1_8 = ifelse(qc1_8 == 6, NA, qc1_8),
qc1_9 = ifelse(qc1_9 == 6, NA, qc1_9),
qc1_10 = ifelse(qc1_10 == 6, NA, qc1_10),
qc5_1 = ifelse(qc5_1 == 3, NA, qc5_1),
qc5_2 = ifelse(qc5_2 == 3, NA, qc5_2),
qc5_3 = ifelse(qc5_3 == 3, NA, qc5_3),
qc5_4 = ifelse(qc5_4 == 3, NA, qc5_4),
qc6_1 = ifelse(qc6_1 == 12, NA, qc6_1),
qc6_2 = ifelse(qc6_2 == 12, NA, qc6_2),
qc6_3 = ifelse(qc6_3 == 12, NA, qc6_3),
qc6_4 = ifelse(qc6_4 == 12, NA, qc6_4),
qc6_5 = ifelse(qc6_5 == 12, NA, qc6_5),
qc6_6 = ifelse(qc6_6 == 12, NA, qc6_6),
qc6_7 = ifelse(qc6_7 == 12, NA, qc6_7),
qc6_8 = ifelse(qc6_8 == 12, NA, qc6_8),
qc6_9 = ifelse(qc6_9 == 12, NA, qc6_9),
qc6_10 = ifelse(qc6_10 == 12, NA, qc6_10),
qc6_11 = ifelse(qc6_11 == 12, NA, qc6_11),
qc9_1 = ifelse(qc9_1 == 7, NA, qc9_1),
qc9_2 = ifelse(qc9_2 == 7, NA, qc9_2),
qc9_3 = ifelse(qc9_3 == 7, NA, qc9_3),
qc9_4 = ifelse(qc9_4 == 7, NA, qc9_4),
qc9_5 = ifelse(qc9_5 == 7, NA, qc9_5),
qc9_6 = ifelse(qc9_6 == 7, NA, qc9_6),
qc9_7 = ifelse(qc9_7 == 7, NA, qc9_7),
qc9_8 = ifelse(qc9_8 == 7, NA, qc9_8),
qc9_9 = ifelse(qc9_9 == 7, NA, qc9_9),
qc9_10 = ifelse(qc9_10 == 7, NA, qc9_10),
qc9_11 = ifelse(qc9_11 == 7, NA, qc9_11),
qc11_1 = ifelse(qc11_1 == 6, NA, qc11_1),
qc11_2 = ifelse(qc11_2 == 6, NA, qc11_2),
qc11_3 = ifelse(qc11_3 == 6, NA, qc11_3),
qc11_4 = ifelse(qc11_4 == 6, NA, qc11_4),
qc11_5 = ifelse(qc11_5 == 6, NA, qc11_5),
qc11_6 = ifelse(qc11_6 == 6, NA, qc11_6),
qc12_1 = ifelse(qc12_1 %in% c(11,12,13), NA, qc12_1),
qc12_2 = ifelse(qc12_2 %in% c(11,12,13), NA, qc12_2),
qc12_3 = ifelse(qc12_3 %in% c(11,12,13), NA, qc12_3),
qc12_4 = ifelse(qc12_4 %in% c(11,12,13), NA, qc12_4),
qc12_5 = ifelse(qc12_5 %in% c(11,12,13), NA, qc12_5),
qc12_6 = ifelse(qc12_6 %in% c(11,12,13), NA, qc12_6),
qc12_7 = ifelse(qc12_7 %in% c(11,12,13), NA, qc12_7),
qc12_8 = ifelse(qc12_8 %in% c(11,12,13), NA, qc12_8),
qc12_9 = ifelse(qc12_9 %in% c(11,12,13), NA, qc12_9),
qc12_10 = ifelse(qc12_10 %in% c(11,12,13), NA, qc12_10),
qc12_11 = ifelse(qc12_11 %in% c(11,12,13), NA, qc12_11),
qc12_12 = ifelse(qc12_12 %in% c(11,12,13), NA, qc12_12),
qc12_13 = ifelse(qc12_13 %in% c(11,12,13), NA, qc12_13),
qc12_14 = ifelse(qc12_14 %in% c(11,12,13), NA, qc12_14),
qc12_15 = ifelse(qc12_14 %in% c(11,12,13), NA, qc12_15),
qc13_1 = ifelse(qc13_1 %in% c(11,12,13), NA, qc13_1),
qc13_2 = ifelse(qc13_2 %in% c(11,12,13), NA, qc13_2),
qc13_3 = ifelse(qc13_3 %in% c(11,12,13), NA, qc13_3),
qc13_4 = ifelse(qc13_4 %in% c(11,12,13), NA, qc13_4),
qc13_5 = ifelse(qc13_5 %in% c(11,12,13), NA, qc13_5),
qc13_6 = ifelse(qc13_6 %in% c(11,12,13), NA, qc13_6),
qc13_7 = ifelse(qc13_7 %in% c(11,12,13), NA, qc13_7),
qc13_8 = ifelse(qc13_8 %in% c(11,12,13), NA, qc13_8),
qc13_9 = ifelse(qc13_9 %in% c(11,12,13), NA, qc13_9),
qc13_10 = ifelse(qc13_10 %in% c(11,12,13), NA, qc13_10),
qc13_11 = ifelse(qc13_11 %in% c(11,12,13), NA, qc13_11),
qc13_12 = ifelse(qc13_12 %in% c(11,12,13), NA, qc13_12),
qc13_13 = ifelse(qc13_13 %in% c(11,12,13), NA, qc13_13),
qc13_14 = ifelse(qc13_14 %in% c(11,12,13), NA, qc13_14),
qc13_15 = ifelse(qc13_14 %in% c(11,12,13), NA, qc13_15),
qc15_1 = ifelse(qc15_1 == 5, NA, qc15_1),
qc15_2 = ifelse(qc15_2 == 5, NA, qc15_2),
qc15_3 = ifelse(qc15_3 == 5, NA, qc15_3),
qc16_1 = ifelse(qc16_1 == 5, NA, qc16_1),
qc17_1 = ifelse(qc17_1 == 5, NA, qc17_1),
qc17_2 = ifelse(qc17_2 == 5, NA, qc17_2),
qc17_3 = ifelse(qc17_3 == 5, NA, qc17_3),
qc17_4 = ifelse(qc17_4 == 5, NA, qc17_4),
qc17_5 = ifelse(qc17_5 == 5, NA, qc17_5),
qc17_6 = ifelse(qc17_6 == 5, NA, qc17_6),
qc17_7 = ifelse(qc17_7 == 5, NA, qc17_7),
qc18_1 = ifelse(qc18_1 %in% c(11,12), NA, qc18_1),
qc18_2 = ifelse(qc18_2 %in% c(11,12), NA, qc18_2),
qc18_3 = ifelse(qc18_3 %in% c(11,12), NA, qc18_3),
sd3 = ifelse(sd3 %in% c(15,16), NA, sd3),
# easy cases for Claude
#q1 = ifelse(q1 %in% c(29,30), NA, q1),
qa1 = ifelse(qa1 == 5, NA, qa1),
qa7 = ifelse(qa7 == 5, NA, qa7),
qa8 = ifelse(qa8 == 4, NA, qa8),
qa9 = ifelse(qa9 == 6, NA, qa9),
qa14 = ifelse(qa14 == 6, NA, qa14),
qa17 = ifelse(qa17 == 5, NA, qa17),
qb6 = ifelse(qb6 == 4, NA, qb6),
qb7 = ifelse(qb7 == 5, NA, qb7),
qc19 = ifelse(qc19 == 3, NA, qc19),
qc20 = ifelse(qc20 == 3, NA, qc20),
d63 = ifelse(d63 %in% c(6,7,8,9), NA, d63), # manually because stupid Claude
d77 = ifelse(d77 == 5, NA, d77),
d70 = ifelse(d70 == 5, NA, d70),
qa4a = ifelse(qa4a %in% c(7,8), NA, qa4a),
qa5a = ifelse(qa5a %in% c(11,12,13), NA, qa5a),
qa11 = ifelse(qa11 %in% c(4,5), NA, qa11),
qa12 = ifelse(qa12 %in% c(5,6), NA, qa12),
qa13 = ifelse(qa13 %in% c(6,7), NA, qa13),
qa18a = ifelse(qa18a %in% c(7,8,9), NA, qa18a),
qb8 = ifelse(qb8 == 5, NA, qb8),
sd3 = ifelse(sd3 == 16, NA, sd3),
qc3 = ifelse(qc3 %in% c(10,11), NA, qc3),
qc7 = ifelse(qc7 %in% c(11,12), NA, qc7),
qc8 = ifelse(qc8 %in% c(11,12), NA, qc8),
qc10 = ifelse(qc10 %in% c(9,10), NA, qc10))
nonresp_analysis <- data_correctly_coded_nonresp_analysis %>%
mutate(nonresponse = ifelse(is.na(qc19), 1, 0)) %>%
select(-all_of(non_numeric_vars))
## calculate correlations with target's NAs
# start by comparing to countries
nonresp_by_country <- nonresp_analysis %>%
group_by(isocntry) %>%
summarise(
nonresp_count = sum(nonresponse),
total_resp = n(),
nonresp_pct = nonresp_count/total_resp*100)
# visualize country variation
ggplot(nonresp_by_country,
aes(x = reorder(isocntry, -nonresp_pct), y = nonresp_pct)) +
geom_bar(stat = "identity", fill = "steelblue") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
title = "Non-Response Rates by Country",
x = "Country Code",
y = "Non-Response Percentage")
# identify likely categorical variables (those with few unique values)
var_uniqueness <- sapply(nonresp_analysis, function(x) {
if(is.numeric(x)) {
length(unique(x))
} else {
NA
}
})
# consider variables with 10 or fewer unique values as potential categorical variables
likely_categorical <- names(var_uniqueness[!is.na(var_uniqueness) & var_uniqueness < 6])
likely_categorical <- setdiff(likely_categorical, c("nonresponse", "qc19"))
#
chi_square_results <- data.frame(variable = character(),
p_value = numeric(),
stringsAsFactors = FALSE)
for (var in likely_categorical) {
# Create contingency table
cont_table <- table(nonresp_analysis$nonresponse, nonresp_analysis[[var]])
# Calculate chi-square test
chi_test <- chisq.test(cont_table)
# Store result
chi_square_results <- rbind(chi_square_results,
data.frame(variable = var, p_value = chi_test$p.value))
}
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
# Show significant associations
chi_square_results %>%
filter(p_value < 0.05) %>%
arrange(p_value)
## variable p_value
## 1 q1_30 0.000000e+00
## 2 eu28 0.000000e+00
## 3 qc6_10r 2.044685e-290
## 4 qc7r 4.886901e-252
## 5 qc6_11r 1.407293e-250
## 6 qc6_2r 4.356071e-225
## 7 qc18_2r 1.612896e-203
## 8 qc18_3r 8.893443e-200
## 9 qc11_6 8.014728e-181
## 10 qc14r 1.431519e-177
## 11 qc11_4 4.128275e-164
## 12 qc11_1 3.882881e-163
## 13 qc11_2 2.545518e-162
## 14 qc11_3 3.055025e-159
## 15 qc11_5 4.986443e-158
## 16 qa5t_13 1.698969e-141
## 17 qa10_6 1.265817e-139
## 18 qc8r 2.025190e-138
## 19 qc4_18 3.460117e-132
## 20 qb2_2 2.610051e-115
## 21 qc6_4r 3.079972e-115
## 22 qc6_3r 2.904656e-114
## 23 qc18_1r 1.158141e-107
## 24 d62t 2.685434e-107
## 25 qc6_5r 1.268464e-106
## 26 qc6_8r 2.092497e-99
## 27 qc6_9r 2.593222e-98
## 28 d1r1 1.385848e-96
## 29 qc6_6r 3.672115e-88
## 30 qc6_7r 9.272870e-88
## 31 qb2_4 1.225998e-84
## 32 sd1_4 1.123999e-82
## 33 qb9_12 1.923772e-81
## 34 qa4t_8 2.088283e-79
## 35 qc15_3 2.277333e-77
## 36 qc15_2 1.051620e-73
## 37 qc6_1r 1.604390e-71
## 38 qa6_8 1.362904e-70
## 39 qb2_1 2.246904e-69
## 40 qb2_3 1.393748e-67
## 41 p5 9.708885e-67
## 42 qb1_12 7.900936e-63
## 43 q1_26 1.288093e-62
## 44 eu_nms12 3.913407e-62
## 45 qa5t1 3.665003e-61
## 46 sd1_2 6.049019e-59
## 47 euronz13 1.062676e-52
## 48 eu15 5.168859e-52
## 49 eu_nms13 5.168859e-52
## 50 qa18t_8 2.515989e-51
## 51 qb6 1.748491e-49
## 52 qc4t_2 8.497680e-48
## 53 opls 1.413440e-46
## 54 euronz15 1.372073e-45
## 55 euroz13 1.405807e-44
## 56 qc15_1 1.952760e-44
## 57 d11r1 2.265525e-44
## 58 qc4t_1 1.194079e-43
## 59 gen3 2.756845e-42
## 60 eu12_1 4.517978e-42
## 61 gen2 5.559090e-41
## 62 qa4b_8 9.351894e-41
## 63 qa18t_4 2.452999e-40
## 64 euroz15 1.183887e-37
## 65 eu_ac2 1.793942e-37
## 66 qc20 3.218264e-37
## 67 eu12_2 9.452635e-37
## 68 qc5_2 1.037711e-36
## 69 qc5_1 2.121594e-34
## 70 qa18t_9 2.096787e-33
## 71 qc4_1 6.531364e-33
## 72 qc4t_3 1.531475e-32
## 73 euronz16 2.178984e-32
## 74 qa5b_13 3.544533e-32
## 75 eu10 4.714907e-32
## 76 qa1 5.400879e-31
## 77 d77 1.413699e-30
## 78 eurnz18c 4.786919e-30
## 79 d43t 7.120921e-30
## 80 qc4_13 1.816966e-29
## 81 qc5_4 1.960980e-29
## 82 eu9 4.872873e-28
## 83 qb9_7 2.582998e-27
## 84 qa18t_5 5.199612e-27
## 85 qa3_7 1.647446e-26
## 86 eu6 1.765025e-26
## 87 qc4_14 2.845552e-26
## 88 qc2_16 5.917563e-26
## 89 euroz16 1.679575e-25
## 90 q1_1 2.475417e-25
## 91 qc4_4 4.637918e-25
## 92 d43b 5.058858e-25
## 93 qa5t_1 5.858261e-25
## 94 qa18b_9 1.041380e-24
## 95 qc4_5 1.271743e-24
## 96 qc2t2 2.459485e-24
## 97 d70 2.960622e-24
## 98 eu_nms10 3.100520e-24
## 99 qa4t_2 5.365959e-24
## 100 qc4_3 8.206178e-23
## 101 polintr 8.566328e-23
## 102 qc4_15 1.299652e-22
## 103 qa5t_2 4.202083e-22
## 104 d72_1 5.959396e-22
## 105 qc2t1 9.676312e-22
## 106 qc4_8 3.644844e-21
## 107 sd1_1 3.950729e-21
## 108 gen6 4.019052e-21
## 109 d71_1 5.836218e-21
## 110 sd1_7 7.422958e-21
## 111 qa5t_3 1.036522e-20
## 112 qc5_3 2.356009e-20
## 113 d40_d11 2.578091e-19
## 114 eurnz17a 3.034910e-19
## 115 d71_2 3.358979e-19
## 116 qc4_10 7.509064e-19
## 117 d72_2 1.712351e-18
## 118 eu25 3.108513e-18
## 119 eu_cc3 3.108513e-18
## 120 q1_10 4.665997e-18
## 121 qa18t_2 2.003105e-17
## 122 qb8 4.583134e-17
## 123 qb1_7 5.338023e-17
## 124 qc2t_2 6.837072e-17
## 125 d15a_r1 7.026568e-17
## 126 qb3_6 7.937370e-17
## 127 qa5t_7 1.481144e-16
## 128 qc4_9 2.086661e-16
## 129 sd2_9 9.372057e-16
## 130 qc17_3 1.118165e-15
## 131 qb1_11 2.203586e-15
## 132 qc17_2 3.325650e-15
## 133 qc2_3 3.824190e-15
## 134 qa10_2 4.623632e-15
## 135 qa8 6.209991e-15
## 136 qc2t_1 1.592234e-14
## 137 qb9_3 1.680271e-14
## 138 qc4_7 1.858493e-14
## 139 qc4_2 2.119543e-14
## 140 qb1_4 2.446559e-14
## 141 eurnz18b 2.531609e-14
## 142 euroz17 5.449408e-14
## 143 eurnz17b 5.449408e-14
## 144 q1_18 6.255103e-14
## 145 sd2_10 7.213815e-14
## 146 qb1_2 1.470240e-13
## 147 qa18b_8 2.733686e-13
## 148 qa12 3.053371e-13
## 149 qc17_5 3.382632e-13
## 150 qc17_4 4.134190e-13
## 151 qa4t_6 4.334440e-13
## 152 qa4t_3 4.975980e-13
## 153 sd1_6 8.092142e-13
## 154 qa5t2 8.925986e-13
## 155 qc2_15 1.017303e-12
## 156 q1_23 1.653697e-12
## 157 qc4_6 2.306592e-12
## 158 qb3_2 2.520115e-12
## 159 qb9_5 3.651558e-12
## 160 qb5_1 4.491434e-12
## 161 d40abc_r 4.522913e-12
## 162 qb3_4 7.085733e-12
## 163 qb3_1 7.955607e-12
## 164 qb4_2 8.896783e-12
## 165 qb9_8 1.167710e-11
## 166 qc17_6 1.590429e-11
## 167 sd1_8 1.752213e-11
## 168 qb1_8 1.957163e-11
## 169 d71_3 2.209437e-11
## 170 qa6_3 2.855182e-11
## 171 gen5 3.079030e-11
## 172 qb3_5 3.911969e-11
## 173 qb9_6 1.112472e-10
## 174 qb5_2 2.270824e-10
## 175 qb4_4 3.276973e-10
## 176 p7gr_r 3.949888e-10
## 177 q1_24 4.374818e-10
## 178 qa6_6 5.266917e-10
## 179 q1_16 7.943816e-10
## 180 euroz18 9.087967e-10
## 181 eurnz18a 9.087967e-10
## 182 qb3_3 9.446784e-10
## 183 qb3_7 1.435136e-09
## 184 qb1_5 1.864389e-09
## 185 qa18b_5 2.440696e-09
## 186 qa10_3 3.507219e-09
## 187 qa5t_6 3.992039e-09
## 188 sd2_5 4.884312e-09
## 189 qc17_1 7.380343e-09
## 190 qa6_1 7.512216e-09
## 191 qb7 8.344996e-09
## 192 qb1_1 8.450203e-09
## 193 qc2t_3 9.265775e-09
## 194 gen4 9.906682e-09
## 195 qa10_4 1.858086e-08
## 196 qb5_4 2.126871e-08
## 197 qa4t_4 2.503853e-08
## 198 qb5_3 2.971480e-08
## 199 qc2_13 3.365795e-08
## 200 qc4_17 3.777663e-08
## 201 qa3_3 4.663245e-08
## 202 qa18b_1 5.692094e-08
## 203 qa4b_2 7.597421e-08
## 204 sd2t 9.306867e-08
## 205 qc16_1 1.009968e-07
## 206 sd2_3 1.333819e-07
## 207 qa5t_8 1.371978e-07
## 208 qa18b_4 1.677539e-07
## 209 qc2_11 2.080382e-07
## 210 qa5b_3 3.181619e-07
## 211 qa10_5 3.471105e-07
## 212 q1_13 3.638465e-07
## 213 qc4_12 3.842966e-07
## 214 p6hr 4.821467e-07
## 215 euroz19 5.578794e-07
## 216 euronz19 5.578794e-07
## 217 p6lt 5.757985e-07
## 218 qc17_7 6.800755e-07
## 219 qa2_8 7.098682e-07
## 220 qb4_3 7.196575e-07
## 221 qc2_12 7.777325e-07
## 222 qa15_5 9.602951e-07
## 223 qb9_4 1.078878e-06
## 224 d10 1.482212e-06
## 225 d40a_r 1.528080e-06
## 226 p6ro 1.645223e-06
## 227 qa5b_1 1.688887e-06
## 228 qa5t_5 2.690016e-06
## 229 qa4t_7 2.929730e-06
## 230 qc2_4 2.996157e-06
## 231 qa2_6 3.120801e-06
## 232 qb4_1 3.238338e-06
## 233 d43a 3.411716e-06
## 234 qc2_2 4.496683e-06
## 235 nutslvl 4.874204e-06
## 236 qc2_9 6.122456e-06
## 237 qa4b_7 7.674815e-06
## 238 qa6_7 1.352228e-05
## 239 d40b_r 1.759982e-05
## 240 q1_2 2.005715e-05
## 241 qb4_5 2.015061e-05
## 242 sd2_2 2.049006e-05
## 243 qa3_4 2.589069e-05
## 244 qa2_4 2.736553e-05
## 245 p6ie 3.764434e-05
## 246 qb9_1 4.856049e-05
## 247 sd1_5 6.451635e-05
## 248 qa5t_4 6.531201e-05
## 249 eu_nms3 7.696703e-05
## 250 p7fi 1.024245e-04
## 251 qb1_9 1.208415e-04
## 252 p7ro_r 1.337295e-04
## 253 qa7 2.426730e-04
## 254 qa3_6 2.548023e-04
## 255 qa5t_12 3.089768e-04
## 256 q1_20 3.223279e-04
## 257 q1_28 4.560014e-04
## 258 qa5b_7 4.854006e-04
## 259 qa2_3 6.196268e-04
## 260 qa17 6.970119e-04
## 261 qa6_2 7.354174e-04
## 262 eu27 8.092725e-04
## 263 q1_5 8.187847e-04
## 264 q1_21 9.613980e-04
## 265 p7sk 9.944782e-04
## 266 qa15_1 1.064753e-03
## 267 qa16_2 1.352530e-03
## 268 qa2_5 1.751906e-03
## 269 qa4b_6 2.155548e-03
## 270 q1_25 2.321971e-03
## 271 qa15_3 2.400124e-03
## 272 qa16_1 2.422900e-03
## 273 q1_4 2.451687e-03
## 274 qc2_6 2.773349e-03
## 275 qc2_7 2.827257e-03
## 276 qb9_11 2.831141e-03
## 277 q1_17 2.854416e-03
## 278 qc2_10 3.152968e-03
## 279 qa3_5 3.873243e-03
## 280 qa3_1 4.025280e-03
## 281 p6fi 4.211379e-03
## 282 p6de 4.399833e-03
## 283 qa5b_2 5.148265e-03
## 284 qa5b_11 5.324307e-03
## 285 qb9_10 6.119215e-03
## 286 qc2_14 6.796681e-03
## 287 qa18t_3 6.817408e-03
## 288 q1_6 7.295408e-03
## 289 qc2_1 8.368310e-03
## 290 p13 1.063394e-02
## 291 qc2_8 1.080884e-02
## 292 qa5b_6 1.101767e-02
## 293 q1_11 1.166239e-02
## 294 qa4b_4 1.275666e-02
## 295 q1_9 1.370648e-02
## 296 qa4t_5 1.396210e-02
## 297 qb9_9 1.474250e-02
## 298 d40c_r 1.657626e-02
## 299 p6sk 1.690870e-02
## 300 qa18b_2 1.694901e-02
## 301 p6it 2.186514e-02
## 302 qa2_1 2.281163e-02
## 303 qa5b_9 2.649771e-02
## 304 qc4_11 2.671142e-02
## 305 gen1 2.673010e-02
## 306 p13lv 2.756553e-02
## 307 qb1_6 2.881619e-02
## 308 p6lv 2.911116e-02
## 309 p13fi 3.011088e-02
## 310 qa4t_1 3.177702e-02
## 311 qa10_1 3.357706e-02
## 312 sd2_6 4.182597e-02
## 313 sd2_7 4.294333e-02
##
continuous_vars <- names(var_uniqueness[!is.na(var_uniqueness) & var_uniqueness >= 6])
continuous_vars <- setdiff(continuous_vars, c("nonresponse", "qc19"))
correlation_results <- data.frame(variable = character(),
correlation = numeric(),
stringsAsFactors = FALSE)
for (var in continuous_vars) {
# Calculate correlation
cor_val <- cor(nonresp_analysis$nonresponse, nonresp_analysis[[var]],
use = "pairwise.complete.obs")
# Store result
correlation_results <- rbind(correlation_results,
data.frame(variable = var, correlation = cor_val))
}
# Show strongest correlations
correlation_results %>% arrange(desc(abs(correlation)))
## variable correlation
## 1 p7ro 1.961854e-01
## 2 p7gr 1.858957e-01
## 3 p7ie 1.701746e-01
## 4 netuse 1.320891e-01
## 5 d62_1 1.302669e-01
## 6 d62_3 1.301065e-01
## 7 p7cy -1.284124e-01
## 8 qc12_11r 1.268925e-01
## 9 qc12_12r 1.267441e-01
## 10 d1r2 1.220472e-01
## 11 qc13_4r 1.200788e-01
## 12 qc13_8r 1.200647e-01
## 13 qc18_2 -1.188009e-01
## 14 qc18_3 -1.184702e-01
## 15 qc12_4r 1.119538e-01
## 16 qc12_7r 1.107519e-01
## 17 qc13_14r 1.105615e-01
## 18 qc12_8r 1.104514e-01
## 19 qc13_10 -1.102491e-01
## 20 qc12_14r 1.089403e-01
## 21 qc12_9r 1.048803e-01
## 22 qc13_11 -1.039340e-01
## 23 qc12_15r 1.029836e-01
## 24 p7pt -1.026675e-01
## 25 qc12_5r 1.015150e-01
## 26 qc12_13r 1.006697e-01
## 27 p7ee 9.993185e-02
## 28 qc1_6 9.991443e-02
## 29 qc13_12 -9.977293e-02
## 30 d11 9.818044e-02
## 31 qc9_10 9.744025e-02
## 32 qc9_11 9.733104e-02
## 33 qc13_5r 9.658877e-02
## 34 d11r3 9.652444e-02
## 35 p7pl_r -9.547155e-02
## 36 d63 -9.471330e-02
## 37 qc13_9r 9.379196e-02
## 38 qc9_5 9.374359e-02
## 39 qc13_12r 9.244527e-02
## 40 qc13_11r 9.138196e-02
## 41 qc14 9.011918e-02
## 42 d11r2 8.982005e-02
## 43 qc13_7r 8.977956e-02
## 44 qc12_10r 8.886650e-02
## 45 d62_4 8.878063e-02
## 46 qa14 -8.872688e-02
## 47 qc12_3r 8.844414e-02
## 48 qc1_9 8.806345e-02
## 49 qc13_13r 8.756854e-02
## 50 d62_2 8.737139e-02
## 51 qc1_1 8.732439e-02
## 52 qc12_6r 8.498051e-02
## 53 qc9_2 8.406519e-02
## 54 qc1_2 8.247491e-02
## 55 qc12_2r 8.210264e-02
## 56 qc9_8 7.925402e-02
## 57 qc9_3 7.922324e-02
## 58 qc13_6r 7.639419e-02
## 59 qc9_1 7.593227e-02
## 60 qc13_2 -7.521485e-02
## 61 d7r2 7.414868e-02
## 62 qc9_9 7.377205e-02
## 63 qc9_4 7.252654e-02
## 64 qc1_4 7.133556e-02
## 65 qc13_15r 7.042722e-02
## 66 p7dk -7.021620e-02
## 67 qc9_6 6.973673e-02
## 68 qc9_7 6.943869e-02
## 69 qc1_8 6.693601e-02
## 70 qc13_3r 6.605581e-02
## 71 p7it -6.550028e-02
## 72 qc1_5 6.390597e-02
## 73 p7it_r2 -6.300557e-02
## 74 qa18a -6.256839e-02
## 75 p7es_r2 -6.217328e-02
## 76 qc12_1r 6.216130e-02
## 77 p7se -6.162646e-02
## 78 qc1_7 6.122670e-02
## 79 qc1_10 6.098965e-02
## 80 qc13_1 -6.043364e-02
## 81 qc13_3 -6.036766e-02
## 82 qc1_3 6.034092e-02
## 83 qc6_2 -5.859010e-02
## 84 qc12_15 5.853208e-02
## 85 w24 5.839735e-02
## 86 w94 5.711731e-02
## 87 p7it_r1 -5.694419e-02
## 88 qc13_1r 5.660931e-02
## 89 p7si -5.650184e-02
## 90 d8 -5.571681e-02
## 91 p7es_r1 -5.461270e-02
## 92 sd3 -5.407886e-02
## 93 p7de 5.350126e-02
## 94 d15a_r2 5.342388e-02
## 95 qc6_10 -5.205970e-02
## 96 w84 5.197034e-02
## 97 p7be -5.125235e-02
## 98 qc13_2r 5.106552e-02
## 99 qc13_7 -5.051020e-02
## 100 qa9 5.022430e-02
## 101 qc6_11 -5.005109e-02
## 102 qc18_1 -4.802767e-02
## 103 qc12_10 -4.731778e-02
## 104 p7pl -4.681212e-02
## 105 p7at 4.323600e-02
## 106 d8r2 -4.268455e-02
## 107 qc6_4 -4.252304e-02
## 108 w13 4.137546e-02
## 109 qc13_8 4.093315e-02
## 110 qc12_11 -4.081044e-02
## 111 qc13_6 -3.986910e-02
## 112 d7r1 3.956674e-02
## 113 qc12_12 -3.806482e-02
## 114 qc12_8 3.785765e-02
## 115 qc13_13 -3.776671e-02
## 116 d40abc -3.759386e-02
## 117 qc12_1 -3.656406e-02
## 118 p7lv -3.623722e-02
## 119 qa5a 3.364841e-02
## 120 d15a -3.361297e-02
## 121 qc10 3.355247e-02
## 122 qc6_5 -3.322756e-02
## 123 qc13_4 3.295031e-02
## 124 qc12_4 3.224803e-02
## 125 w10 -3.215236e-02
## 126 qc12_14 3.166535e-02
## 127 p7gb_r -3.125343e-02
## 128 d40b -3.080266e-02
## 129 qc6_1 3.064529e-02
## 130 p7bg 2.995536e-02
## 131 qc13_10r 2.780521e-02
## 132 p7lt -2.708901e-02
## 133 w11 -2.669060e-02
## 134 qc12_2 -2.569431e-02
## 135 w8 -2.537512e-02
## 136 d40a -2.480309e-02
## 137 d7 2.394939e-02
## 138 qc12_3 -2.389543e-02
## 139 w9 -2.273540e-02
## 140 qc13_15 2.269340e-02
## 141 d7r3 -2.176323e-02
## 142 w30 2.139600e-02
## 143 d1 2.111394e-02
## 144 p7 -2.067377e-02
## 145 w29 -2.051203e-02
## 146 qc13_14 2.046962e-02
## 147 w81 -1.890283e-02
## 148 w82 1.867097e-02
## 149 w89 -1.861186e-02
## 150 w98 -1.827807e-02
## 151 w90 1.815757e-02
## 152 w85 -1.796005e-02
## 153 qc6_6 1.795772e-02
## 154 qc12_6 -1.784957e-02
## 155 w7 -1.780729e-02
## 156 w83 1.755898e-02
## 157 d40c -1.730106e-02
## 158 w6 -1.702210e-02
## 159 w99 1.676012e-02
## 160 w86 1.616610e-02
## 161 qa4a -1.543773e-02
## 162 d15b 1.527444e-02
## 163 qc13_5 -1.478621e-02
## 164 qc12_9 1.420123e-02
## 165 w14 -1.353171e-02
## 166 qc3 -1.322936e-02
## 167 qc12_7 -1.269117e-02
## 168 qc6_9 1.222451e-02
## 169 qc7 -1.208909e-02
## 170 w1 -1.179826e-02
## 171 w5 -1.173662e-02
## 172 qc6_8 1.173133e-02
## 173 qc12_13 1.133439e-02
## 174 qc8 -1.051202e-02
## 175 qc6_3 -1.026962e-02
## 176 p7fr -9.833273e-03
## 177 p7fr_r -9.373501e-03
## 178 p7cz -9.242256e-03
## 179 p2 -9.232808e-03
## 180 qc6_7 9.021552e-03
## 181 p7hu 8.947736e-03
## 182 qc12_5 8.808118e-03
## 183 p3 -8.225566e-03
## 184 qa13 7.855707e-03
## 185 w23 -7.268646e-03
## 186 wex -7.268645e-03
## 187 qc13_9 -6.882890e-03
## 188 w22 -6.743143e-03
## 189 p7nl -6.721291e-03
## 190 p7es 5.942449e-03
## 191 w3 4.828364e-03
## 192 p3r -3.139806e-03
## 193 w87 -1.910455e-03
## 194 p7gb 1.732731e-03
## 195 d8r1 1.383810e-03
## 196 p7hr 3.642174e-04
## 197 p1 -6.137684e-05
We can see that non-response in qc19 is not random; it is highly correlated to numerous variables (both categorical and continuous) such as countries (various q1) or regions (p7) responses to LGBT/ discrimination questions (such as qc6, qc7, qc12, qc13 and qc18) and paradata/ survey cooperation (p5). Therefore, the 12% missingness of qc19 is definitely not MCAR, but at least MAR or even MNAR.
“Don’t know” (DK) responses are especially problematic for ordinal variables, because leaving them as is would inflate support levels, while recoding them as 0 could misrepresent them as the most negative response. We will therefore code all DKs and refusals (and a few other special cases) as NA.
data <- read_dta("data/raw/ZA7575.dta")
data_correctly_coded <- data %>%
mutate(
# hard cases for me
d72_1 = ifelse(d72_1 %in% c(5,6), NA, d72_1),
d72_2 = ifelse(d72_2 %in% c(5,6), NA, d72_2),
d60 = ifelse(d60 == 7, NA, d60),
d25 = ifelse(d25 == 8, NA, d25),
d8 = ifelse(d8 > 70, NA, d8), # subjective decision that no one can have more than 70 years of ed (even full-time professors)
d7 = ifelse(d7 %in% c(15,97), NA, d7),
d1 = ifelse(d1 %in% c(97,98), NA, d1),
qa16_1 = ifelse(qa16_1 == 5, NA, qa16_1),
qa16_2 = ifelse(qa16_2 == 5, NA, qa16_2),
qa16_3 = ifelse(qa16_3 == 5, NA, qa16_3),
qa16_4 = ifelse(qa16_4 == 5, NA, qa16_4),
d71_1 = ifelse(d71_1 == 4, NA, d71_1),
d71_2 = ifelse(d71_2 == 4, NA, d71_2),
d71_3 = ifelse(d71_3 == 4, NA, d71_3),
#qb4_1 = ifelse(qb4_1 == 5, NA, qb4_1),
#qb4_2 = ifelse(qb4_2 == 5, NA, qb4_2),
#qb4_3 = ifelse(qb4_3 == 5, NA, qb4_3),
#qb4_4 = ifelse(qb4_4 == 5, NA, qb4_4),
qb3_1 = ifelse(qb3_1 == 5, NA, qb3_1),
qb3_2 = ifelse(qb3_2 == 5, NA, qb3_2),
qb3_3 = ifelse(qb3_3 == 5, NA, qb3_3),
qb3_4 = ifelse(qb3_4 == 5, NA, qb3_4),
qb3_5 = ifelse(qb3_5 == 5, NA, qb3_5),
qb3_6 = ifelse(qb3_6 == 5, NA, qb3_6),
qb3_7 = ifelse(qb3_7 == 5, NA, qb3_7),
qb4_1 = ifelse(qb4_1 == 5, NA, qb4_1),
qb4_2 = ifelse(qb4_2 == 5, NA, qb4_2),
qb4_3 = ifelse(qb4_3 == 5, NA, qb4_3),
qb4_4 = ifelse(qb4_4 == 5, NA, qb4_4),
qb4_5 = ifelse(qb4_5 == 5, NA, qb4_5),
qb5_1 = ifelse(qb5_1 == 5, NA, qb5_1),
qb5_2 = ifelse(qb5_2 == 5, NA, qb5_2),
qb5_3 = ifelse(qb5_3 == 5, NA, qb5_3),
qb5_4 = ifelse(qb5_4 == 5, NA, qb5_4),
sd1_1 = ifelse(sd1_1 %in% c(3,4), NA, sd1_1),
sd1_2 = ifelse(sd1_2 %in% c(3,4), NA, sd1_2),
sd1_3 = ifelse(sd1_3 %in% c(3,4), NA, sd1_3),
sd1_4 = ifelse(sd1_4 %in% c(3,4), NA, sd1_4),
sd1_5 = ifelse(sd1_5 %in% c(3,4), NA, sd1_5),
sd1_6 = ifelse(sd1_6 %in% c(3,4), NA, sd1_6),
sd1_7 = ifelse(sd1_7 %in% c(3,4), NA, sd1_7),
sd1_8 = ifelse(sd1_8 %in% c(3,4), NA, sd1_8),
qc1_1 = ifelse(qc1_1 == 6, NA, qc1_1),
qc1_2 = ifelse(qc1_2 == 6, NA, qc1_2),
qc1_3 = ifelse(qc1_3 == 6, NA, qc1_3),
qc1_4 = ifelse(qc1_4 == 6, NA, qc1_4),
qc1_5 = ifelse(qc1_5 == 6, NA, qc1_5),
qc1_6 = ifelse(qc1_6 == 6, NA, qc1_6),
qc1_7 = ifelse(qc1_7 == 6, NA, qc1_7),
qc1_8 = ifelse(qc1_8 == 6, NA, qc1_8),
qc1_9 = ifelse(qc1_9 == 6, NA, qc1_9),
qc1_10 = ifelse(qc1_10 == 6, NA, qc1_10),
qc5_1 = ifelse(qc5_1 == 3, NA, qc5_1),
qc5_2 = ifelse(qc5_2 == 3, NA, qc5_2),
qc5_3 = ifelse(qc5_3 == 3, NA, qc5_3),
qc5_4 = ifelse(qc5_4 == 3, NA, qc5_4),
qc6_1 = ifelse(qc6_1 == 12, NA, qc6_1),
qc6_2 = ifelse(qc6_2 == 12, NA, qc6_2),
qc6_3 = ifelse(qc6_3 == 12, NA, qc6_3),
qc6_4 = ifelse(qc6_4 == 12, NA, qc6_4),
qc6_5 = ifelse(qc6_5 == 12, NA, qc6_5),
qc6_6 = ifelse(qc6_6 == 12, NA, qc6_6),
qc6_7 = ifelse(qc6_7 == 12, NA, qc6_7),
qc6_8 = ifelse(qc6_8 == 12, NA, qc6_8),
qc6_9 = ifelse(qc6_9 == 12, NA, qc6_9),
qc6_10 = ifelse(qc6_10 == 12, NA, qc6_10),
qc6_11 = ifelse(qc6_11 == 12, NA, qc6_11),
qc9_1 = ifelse(qc9_1 == 7, NA, qc9_1),
qc9_2 = ifelse(qc9_2 == 7, NA, qc9_2),
qc9_3 = ifelse(qc9_3 == 7, NA, qc9_3),
qc9_4 = ifelse(qc9_4 == 7, NA, qc9_4),
qc9_5 = ifelse(qc9_5 == 7, NA, qc9_5),
qc9_6 = ifelse(qc9_6 == 7, NA, qc9_6),
qc9_7 = ifelse(qc9_7 == 7, NA, qc9_7),
qc9_8 = ifelse(qc9_8 == 7, NA, qc9_8),
qc9_9 = ifelse(qc9_9 == 7, NA, qc9_9),
qc9_10 = ifelse(qc9_10 == 7, NA, qc9_10),
qc9_11 = ifelse(qc9_11 == 7, NA, qc9_11),
qc11_1 = ifelse(qc11_1 == 6, NA, qc11_1),
qc11_2 = ifelse(qc11_2 == 6, NA, qc11_2),
qc11_3 = ifelse(qc11_3 == 6, NA, qc11_3),
qc11_4 = ifelse(qc11_4 == 6, NA, qc11_4),
qc11_5 = ifelse(qc11_5 == 6, NA, qc11_5),
qc11_6 = ifelse(qc11_6 == 6, NA, qc11_6),
qc12_1 = ifelse(qc12_1 %in% c(11,12,13), NA, qc12_1),
qc12_2 = ifelse(qc12_2 %in% c(11,12,13), NA, qc12_2),
qc12_3 = ifelse(qc12_3 %in% c(11,12,13), NA, qc12_3),
qc12_4 = ifelse(qc12_4 %in% c(11,12,13), NA, qc12_4),
qc12_5 = ifelse(qc12_5 %in% c(11,12,13), NA, qc12_5),
qc12_6 = ifelse(qc12_6 %in% c(11,12,13), NA, qc12_6),
qc12_7 = ifelse(qc12_7 %in% c(11,12,13), NA, qc12_7),
qc12_8 = ifelse(qc12_8 %in% c(11,12,13), NA, qc12_8),
qc12_9 = ifelse(qc12_9 %in% c(11,12,13), NA, qc12_9),
qc12_10 = ifelse(qc12_10 %in% c(11,12,13), NA, qc12_10),
qc12_11 = ifelse(qc12_11 %in% c(11,12,13), NA, qc12_11),
qc12_12 = ifelse(qc12_12 %in% c(11,12,13), NA, qc12_12),
qc12_13 = ifelse(qc12_13 %in% c(11,12,13), NA, qc12_13),
qc12_14 = ifelse(qc12_14 %in% c(11,12,13), NA, qc12_14),
qc12_15 = ifelse(qc12_14 %in% c(11,12,13), NA, qc12_15),
qc13_1 = ifelse(qc13_1 %in% c(11,12,13), NA, qc13_1),
qc13_2 = ifelse(qc13_2 %in% c(11,12,13), NA, qc13_2),
qc13_3 = ifelse(qc13_3 %in% c(11,12,13), NA, qc13_3),
qc13_4 = ifelse(qc13_4 %in% c(11,12,13), NA, qc13_4),
qc13_5 = ifelse(qc13_5 %in% c(11,12,13), NA, qc13_5),
qc13_6 = ifelse(qc13_6 %in% c(11,12,13), NA, qc13_6),
qc13_7 = ifelse(qc13_7 %in% c(11,12,13), NA, qc13_7),
qc13_8 = ifelse(qc13_8 %in% c(11,12,13), NA, qc13_8),
qc13_9 = ifelse(qc13_9 %in% c(11,12,13), NA, qc13_9),
qc13_10 = ifelse(qc13_10 %in% c(11,12,13), NA, qc13_10),
qc13_11 = ifelse(qc13_11 %in% c(11,12,13), NA, qc13_11),
qc13_12 = ifelse(qc13_12 %in% c(11,12,13), NA, qc13_12),
qc13_13 = ifelse(qc13_13 %in% c(11,12,13), NA, qc13_13),
qc13_14 = ifelse(qc13_14 %in% c(11,12,13), NA, qc13_14),
qc13_15 = ifelse(qc13_14 %in% c(11,12,13), NA, qc13_15),
qc15_1 = ifelse(qc15_1 == 5, NA, qc15_1),
qc15_2 = ifelse(qc15_2 == 5, NA, qc15_2),
qc15_3 = ifelse(qc15_3 == 5, NA, qc15_3),
qc16_1 = ifelse(qc16_1 == 5, NA, qc16_1),
qc17_1 = ifelse(qc17_1 == 5, NA, qc17_1),
qc17_2 = ifelse(qc17_2 == 5, NA, qc17_2),
qc17_3 = ifelse(qc17_3 == 5, NA, qc17_3),
qc17_4 = ifelse(qc17_4 == 5, NA, qc17_4),
qc17_5 = ifelse(qc17_5 == 5, NA, qc17_5),
qc17_6 = ifelse(qc17_6 == 5, NA, qc17_6),
qc17_7 = ifelse(qc17_7 == 5, NA, qc17_7),
qc18_1 = ifelse(qc18_1 %in% c(11,12), NA, qc18_1),
qc18_2 = ifelse(qc18_2 %in% c(11,12), NA, qc18_2),
qc18_3 = ifelse(qc18_3 %in% c(11,12), NA, qc18_3),
sd3 = ifelse(sd3 %in% c(15,16), NA, sd3),
qa1 = ifelse(qa1 == 5, NA, qa1),
qa7 = ifelse(qa7 == 5, NA, qa7),
qa8 = ifelse(qa8 == 4, NA, qa8),
qa9 = ifelse(qa9 == 6, NA, qa9),
qa14 = ifelse(qa14 == 6, NA, qa14),
qa17 = ifelse(qa17 == 5, NA, qa17),
qb6 = ifelse(qb6 == 4, NA, qb6),
qb7 = ifelse(qb7 == 5, NA, qb7),
qc19 = ifelse(qc19 == 3, NA, qc19),
qc20 = ifelse(qc20 == 3, NA, qc20),
d63 = ifelse(d63 %in% c(6,7,8,9), NA, d63), # manually because stupid Claude
d77 = ifelse(d77 == 5, NA, d77),
d70 = ifelse(d70 == 5, NA, d70),
qa4a = ifelse(qa4a %in% c(7,8), NA, qa4a),
qa5a = ifelse(qa5a %in% c(11,12,13), NA, qa5a),
qa11 = ifelse(qa11 %in% c(4,5), NA, qa11),
qa12 = ifelse(qa12 %in% c(5,6), NA, qa12),
qa13 = ifelse(qa13 %in% c(6,7), NA, qa13),
qa18a = ifelse(qa18a %in% c(7,8,9), NA, qa18a),
qb8 = ifelse(qb8 == 5, NA, qb8),
sd3 = ifelse(sd3 == 16, NA, sd3),
qc3 = ifelse(qc3 %in% c(10,11), NA, qc3),
qc7 = ifelse(qc7 %in% c(11,12), NA, qc7),
qc8 = ifelse(qc8 %in% c(11,12), NA, qc8),
qc10 = ifelse(qc10 %in% c(9,10), NA, qc10))
###
# get rid of all the "r" coded ones
# for the questions with underscores, (1) if they are on a scale, e.g., 1-3 and 'DK'
# is one of them, replace 'DK'; if it's 0-1 coding, keep that (can't do anything about it)
data_correctly_coded <- data_correctly_coded %>%
select(-matches("_r$|_r[0-9]$|_r[0-9]_"))
# use setdiff() to check whether it's done properly
###
# now analyse the patterns of NAs across columns
missing_data_before <- data_correctly_coded %>%
summarise(across(everything(), ~sum(is.na(.))/n())) %>%
pivot_longer(cols = everything(),
names_to = "variable",
values_to = "missing_proportion") %>%
arrange(desc(missing_proportion))
# view top variables with missing data
print(head(missing_data_before, 20))
## # A tibble: 20 × 2
## variable missing_proportion
## <chr> <dbl>
## 1 p6mt 0.982
## 2 p13mt 0.982
## 3 p6cy 0.982
## 4 p7cy 0.982
## 5 p6lu 0.981
## 6 p7lu 0.981
## 7 p13lu 0.981
## 8 p6hr 0.964
## 9 p7hr 0.964
## 10 p6ee 0.963
## 11 p6fi 0.963
## 12 p6lt 0.963
## 13 p7ee 0.963
## 14 p7fi 0.963
## 15 p7lt 0.963
## 16 p13ee 0.963
## 17 p13fi 0.963
## 18 p6dk 0.963
## 19 p7dk 0.963
## 20 p6es 0.963
# viz
ggplot(missing_data_before %>% filter(missing_proportion > 0.25),
aes(x = reorder(variable, missing_proportion), y = missing_proportion)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variables with >25% Missing Data",
x = "Variable",
y = "Proportion Missing") +
theme_minimal()
# calculate overall proportion of missing data
mean(missing_data_before$missing_proportion)
## [1] 0.1424237
# deeper analysis
missing_by_prefix <- missing_data_before %>%
mutate(prefix = str_extract(variable, "^[a-z]+\\d+")) %>%
group_by(prefix) %>%
summarise(
avg_missing = mean(missing_proportion),
n_vars = n(),
max_missing = max(missing_proportion),
min_missing = min(missing_proportion)
) %>%
arrange(desc(avg_missing))
print(head(missing_by_prefix, 15))
## # A tibble: 15 × 5
## prefix avg_missing n_vars max_missing min_missing
## <chr> <dbl> <int> <dbl> <dbl>
## 1 p13 0.945 8 0.982 0.779
## 2 p6 0.931 29 0.982 0
## 3 p7 0.930 28 0.982 0.0180
## 4 qc3 0.861 1 0.861 0.861
## 5 qa3 0.679 7 0.679 0.679
## 6 qa15 0.627 5 0.627 0.627
## 7 qa2 0.376 8 0.376 0.376
## 8 d15 0.261 2 0.522 0
## 9 qb7 0.208 1 0.208 0.208
## 10 d40 0.157 5 0.784 0
## 11 qa13 0.124 1 0.124 0.124
## 12 qc19 0.120 1 0.120 0.120
## 13 qc16 0.117 1 0.117 0.117
## 14 qc20 0.116 1 0.116 0.116
## 15 qc10 0.109 1 0.109 0.109
# we take out the following variables because (1) they exhibited high missingness
# while at the same aren't super releveant (that's an assumption we make) for our
# subsequent analysis
data_correctly_coded <- data_correctly_coded %>%
select(
# Exclude variables with specific prefixes
-starts_with("p6"), # Size of locality
-starts_with("p7"), # Region
-starts_with("p13"), # Language of interview
-starts_with("d40"), # Household size
-starts_with("qa3"), # Not benefitting from trade
-starts_with("qa15"), # Countries bought goods from
-starts_with("qa2"), # Benefitting from trade
-starts_with("qb8"), # EU energy label influence
-starts_with("qa18b"), # Information sources follow-up
-starts_with("qc3") # Discrimination circumstances
)
# check how much we imporved the missingness rate
missing_data_after <- data_correctly_coded %>%
summarise(across(everything(), ~sum(is.na(.))/n())) %>%
pivot_longer(cols = everything(),
names_to = "variable",
values_to = "missing_proportion") %>%
arrange(desc(missing_proportion))
mean(missing_data_after$missing_proportion)
## [1] 0.02018962
# prepare for imputation
data_correctly_coded <- data_correctly_coded %>%
mutate(isocntry = as.factor(isocntry)) %>%
# remove other problematic variables (like IDs) if needed
select(!any_of(c("doi", "version", "caseid", "uniqid", "serialid"))) %>%
sjlabelled::remove_all_labels()
# delete variables with zero variance as they cannot ever be useful predictors
var_zero <- data_correctly_coded %>%
select(where(is.numeric)) %>%
summarise(across(everything(), ~ var(.x, na.rm = TRUE))) %>%
pivot_longer(everything(), names_to = "variable", values_to = "variance") %>%
filter(variance == 0 | is.na(variance))
if(nrow(var_zero) > 0) {
cat("Removing", nrow(var_zero), "variables with zero variance\n")
data_correctly_coded <- data_correctly_coded %>%
select(!any_of(var_zero$variable))
}
## Removing 6 variables with zero variance
data_correctly_coded <- data_correctly_coded %>%
# convert country codes: collapse East and West Germany into one DE
mutate(isocntry = case_when(
isocntry %in% c("DE-W", "DE-E") ~ "DE",
TRUE ~ isocntry))
###
# run the imputation
set.seed(1212)
start_time <- Sys.time()
#imp_model <- mice(data_correctly_coded, m = 3, method = 'rf', maxit = 3)
end_time <- Sys.time()
end_time - start_time
## Time difference of 0.0008299351 secs
We also needed to make decisions about whether or not to impute missing variables from a few of our external datasets. We chose to delete the variables from the ESS because the Eurobarometer survey already contains similar information and imputing these values didn’t seem appropriate. However, we did choose to impute the happiness score using KNN.
country_level_df <- readRDS("country_level_df.rds")
# first, let's check which variables have missing values
missing_summary <- colSums(is.na(country_level_df))
missing_summary <- missing_summary[missing_summary > 0]
print(missing_summary)
## Happiness_Score n_respondents n_valid
## 1 4 4
## lgbt_support_percent mean_religiosity mean_left_right
## 4 4 4
## mean_equal_values mean_country_attach mean_eduyrs
## 4 4 4
## mean_age pct_young pct_high_educ
## 4 4 4
## se_lgbt_support pct_missing_lgbt age_lgbt_corr
## 4 4 4
## relig_lgbt_corr lgbt_support_inequality educ_gradient
## 4 4 4
## gender_equality_index z_lgbt_support norm_lgbt_support
## 1 4 4
## z_happiness z_gender_equality norm_gender_equality
## 1 1 1
## z_religiosity z_left_right z_equal_values
## 4 4 4
## z_country_attach z_eduyrs z_age
## 4 4 4
# Visualize missing data patterns
vis_miss(country_level_df)
# (1)
# delete ESS variables
ESS_variables <- c(
# original survey variables
"n_respondents", "n_valid", "lgbt_support_percent", "mean_religiosity",
"mean_left_right", "mean_equal_values", "mean_country_attach",
"mean_eduyrs", "mean_age", "pct_young", "pct_high_educ", "se_lgbt_support",
# additional ESS metrics
"pct_missing_lgbt", "age_lgbt_corr", "relig_lgbt_corr",
"lgbt_support_inequality", "educ_gradient",
# z-score transformations of ESS variables
"z_religiosity", "z_left_right", "z_equal_values",
"z_country_attach", "z_eduyrs", "z_age")
country_level_df <- country_level_df %>%
select(-all_of(ESS_variables))
# (2)
# create separate datasets for variables that need imputation
missing_variables <- names(missing_summary)[missing_summary < 5 &
!(names(missing_summary) %in% ESS_variables)]
# for our KNN imputation, we need to identify variables to use as predictors
# these should be variables without NAs that also correlate with those we want to impute
# identify complete variables that could serve as predictors
complete_variables <- names(country_level_df)[colSums(is.na(country_level_df)) == 0]
complete_variables <- complete_variables[!complete_variables %in% c("Unnamed: 0", "country_name", "iso2", "region")]
print(complete_variables)
## [1] "v2x_libdem" "v2x_polyarchy"
## [3] "v2x_gender" "v2x_egaldem"
## [5] "v2x_liberal" "v2xcs_ccsi"
## [7] "v2x_freexp" "Overall_score"
## [9] "Electoral_process_and_pluralism" "Functioning_of_government"
## [11] "Political_participation" "Political_culture"
## [13] "Civil_liberties" "Regime_type"
## [15] "gdp_2005" "gdp_2018"
## [17] "gdp_growth" "rainbow_score_2019"
## [19] "rainbow_score_2018" "rainbow_score_avg_2019_2018"
## [21] "rainbow_score_avg_2013_2014" "rainbow_score_difference"
## [23] "Unemployment" "Employment"
## [25] "z_v2x_libdem" "z_v2x_polyarchy"
## [27] "z_v2x_gender" "z_v2x_egaldem"
## [29] "z_v2x_liberal" "z_v2xcs_ccsi"
## [31] "z_v2x_freexp" "z_gdp_2018"
## [33] "log_gdp_2018" "z_gdp_growth"
## [35] "z_unemployment" "z_rainbow_score"
## [37] "norm_rainbow_score" "z_functioning_of_government"
## [39] "composite_equality" "composite_democracy"
# custom function to impute variables with KNN, handling non-numeric data
knn_impute <- function(data, vars_to_impute, predictor_vars, k = 5) {
# create a copy of the data
imputed_data <- data
# ensure predictor variables are numeric and complete
pred_data <- data[, predictor_vars, drop = FALSE]
pred_data <- as.data.frame(lapply(pred_data, function(x) as.numeric(x)))
# remove any non-numeric columns
numeric_cols <- sapply(pred_data, is.numeric)
if(any(!numeric_cols)) {
warning("Removing non-numeric predictor columns: ",
paste(names(pred_data)[!numeric_cols], collapse=", "))
pred_data <- pred_data[, numeric_cols, drop=FALSE]
predictor_vars <- predictor_vars[numeric_cols]
}
# handle missing values in predictor variables by using column means
for(col in names(pred_data)) {
if(any(is.na(pred_data[[col]]))) {
pred_data[[col]][is.na(pred_data[[col]])] <- mean(pred_data[[col]], na.rm=TRUE)
}
}
# standardize the predictor variables manually to avoid issues
pred_data_std <- as.data.frame(scale(pred_data))
# for each variable to impute:
for (var in vars_to_impute) {
if (var %in% names(data) && sum(is.na(data[[var]])) > 0) {
# ensure the target variable is numeric
if (!is.numeric(data[[var]])) {
cat("Skipping", var, "as it is not numeric\n")
next
}
# identify cases with missing values
missing_indices <- which(is.na(data[[var]]))
# for each missing value:
for (idx in missing_indices) {
# calculate Euclidean distances to all other countries
distances <- numeric(nrow(data))
for (i in 1:nrow(data)) {
if (i != idx) {
# calculate squared differences for each predictor
squared_diffs <- sapply(names(pred_data_std), function(p) {
(pred_data_std[idx, p] - pred_data_std[i, p])^2
})
# sum and take square root for Euclidean distance
distances[i] <- sqrt(sum(squared_diffs, na.rm=TRUE))
} else {
distances[i] <- Inf # don't use the country itself
}
}
# find k nearest neighbors with non-missing values for this variable
valid_neighbors <- which(!is.na(data[[var]]) & !is.infinite(distances))
if (length(valid_neighbors) > 0) {
# order the neighbors by distance
neighbor_order <- order(distances[valid_neighbors])
valid_neighbors <- valid_neighbors[neighbor_order]
# take the k nearest, or as many as available if fewer than k
k_actual <- min(k, length(valid_neighbors))
nearest_k <- valid_neighbors[1:k_actual]
# impute as the average of the k nearest neighbors
imputed_data[idx, var] <- mean(data[nearest_k, var], na.rm = TRUE)
cat("Imputed", var, "for", data$country_name[idx],
"using neighbors:", paste(data$country_name[nearest_k], collapse=", "), "\n")
} else {
cat("Warning: Could not impute", var, "for", data$country_name[idx],
"- no valid neighbors found\n")
}
}
}
}
return(imputed_data)
}
# now use the function to impute variables with missing values
# first ensure we have the right variable types
country_df_numeric <- country_level_df
for (var in missing_variables) {
if (var %in% names(country_level_df) && !is.numeric(country_level_df[[var]])) {
country_df_numeric[[var]] <- as.numeric(as.character(country_level_df[[var]]))
}
}
# run the imputation with proper error handling
tryCatch({
country_df_imputed <- knn_impute(
data = country_df_numeric,
vars_to_impute = missing_variables,
predictor_vars = complete_variables,
k = 3 # using 3 nearest neighbors
)
print("Imputation completed successfully!")
}, error = function(e) {
# if the knn_impute function still fails, let's try an alternative approach
print(paste("KNN imputation error:", e$message))
print("Switching to a simpler mean imputation approach...")
country_df_imputed <- country_df_numeric
for (var in missing_variables) {
if (sum(is.na(country_df_imputed[[var]])) > 0) {
# Calculate mean of non-missing values
var_mean <- mean(country_df_imputed[[var]], na.rm = TRUE)
# Replace missing values with mean
country_df_imputed[[var]][is.na(country_df_imputed[[var]])] <- var_mean
print(paste("Imputed", var, "with mean value:", var_mean))
}
}
return(country_df_imputed)
})
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Imputed Happiness_Score for Belgium using neighbors: France, Portugal, Denmark
## Imputed gender_equality_index for United Kingdom using neighbors: Netherlands, Austria, France
## Imputed z_lgbt_support for Luxembourg using neighbors: Ireland, Austria, Germany
## Imputed z_lgbt_support for Greece using neighbors: Spain, Slovenia, Italy
## Imputed z_lgbt_support for Malta using neighbors: United Kingdom, Croatia, Austria
## Imputed z_lgbt_support for Romania using neighbors: Poland, Bulgaria, Croatia
## Imputed norm_lgbt_support for Luxembourg using neighbors: Ireland, Austria, Germany
## Imputed norm_lgbt_support for Greece using neighbors: Spain, Slovenia, Italy
## Imputed norm_lgbt_support for Malta using neighbors: United Kingdom, Croatia, Austria
## Imputed norm_lgbt_support for Romania using neighbors: Poland, Bulgaria, Croatia
## Imputed z_happiness for Belgium using neighbors: France, Portugal, Denmark
## Imputed z_gender_equality for United Kingdom using neighbors: Netherlands, Austria, France
## Imputed norm_gender_equality for United Kingdom using neighbors: Netherlands, Austria, France
## [1] "Imputation completed successfully!"
# load the data
df_rf_new <- read_csv("df_rf_new.csv") # the imputed dataset
## New names:
## Rows: 27438 Columns: 475
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): isocntry, nuts dbl (473): ...1, tnscntry, country, d11, d11r1, d11r2,
## d11r3, gen1, gen2, ge...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
country_df_imputed <- readRDS("country_df_imputed.rds") # external dataset
# custom function to calculate country-level aggregates
calculate_country_aggregates <- function(df) {
country_agg <- df %>%
group_by(isocntry) %>%
summarize(
iso2 = first(isocntry),
# demographics
mean_age = mean(d11, na.rm = TRUE),
median_age = median(d11, na.rm = TRUE),
# life satisfaction (recoded)
mean_life_satisfaction = mean(5 - d70, na.rm = TRUE),
pct_satisfied = mean(d70 %in% c(1, 2), na.rm = TRUE) * 100,
# political discussions
mean_natl_political_discuss = mean(d71_1, na.rm = TRUE),
mean_eu_political_discuss = mean(d71_2, na.rm = TRUE),
mean_local_political_discuss = mean(d71_3, na.rm = TRUE),
pct_discuss_natl_politics = mean(d71_1 %in% c(1, 2), na.rm = TRUE) * 100,
pct_discuss_eu_politics = mean(d71_2 %in% c(1, 2), na.rm = TRUE) * 100,
pct_discuss_local_politics = mean(d71_3 %in% c(1, 2), na.rm = TRUE) * 100,
# trade and globalization (recoded)
mean_trade_support = mean(5 - qa1, na.rm = TRUE),
pct_trade_support = mean(qa1 %in% c(1, 2), na.rm = TRUE) * 100,
pct_pro_globalization = mean(qa5a %in% c(1, 2, 3, 5, 7, 8), na.rm = TRUE) * 100,
pct_anti_globalization = mean(qa5a %in% c(4, 6, 9, 10), na.rm = TRUE) * 100,
mean_eu_trade_support = mean(5 - qa7, na.rm = TRUE),
pct_support_eu_trade = mean(qa7 %in% c(1, 2), na.rm = TRUE) * 100,
mean_esg_support = mean(5 - qa9, na.rm = TRUE),
pct_support_esg = mean(qa9 %in% c(1, 2), na.rm = TRUE) * 100,
# political and social indicators
mean_left_right = mean(d1, na.rm = TRUE),
pct_left = mean(d1 %in% c(1, 2, 3, 4), na.rm = TRUE) * 100,
pct_center = mean(d1 %in% c(5, 6), na.rm = TRUE) * 100,
pct_right = mean(d1 %in% c(7, 8, 9, 10), na.rm = TRUE) * 100,
mean_education_years = mean(d8, na.rm = TRUE),
median_education_years = median(d8, na.rm = TRUE),
pct_high_education = mean(d8 >= 20, na.rm = TRUE) * 100,
pct_rural = mean(d25 == 1, na.rm = TRUE) * 100,
pct_urban = mean(d25 %in% c(2, 3), na.rm = TRUE) * 100,
pct_large_urban = mean(d25 == 3, na.rm = TRUE) * 100,
mean_financial_difficulty = mean(4 - d60, na.rm = TRUE),
pct_financial_difficulty = mean(d60 %in% c(1, 2), na.rm = TRUE) * 100,
mean_subjective_class = mean(d63, na.rm = TRUE),
pct_working_class = mean(d63 == 1, na.rm = TRUE) * 100,
pct_middle_class = mean(d63 %in% c(2, 3, 4), na.rm = TRUE) * 100,
pct_upper_class = mean(d63 == 5, na.rm = TRUE) * 100,
mean_voice_in_eu = mean(5 - d72_1, na.rm = TRUE),
mean_voice_in_country = mean(5 - d72_2, na.rm = TRUE),
pct_voice_in_eu = mean(d72_1 %in% c(1, 2), na.rm = TRUE) * 100,
pct_voice_in_country = mean(d72_2 %in% c(1, 2), na.rm = TRUE) * 100,
# diversity indicators
pct_friends_diff_ethnic = mean(sd1_1 == 1, na.rm = TRUE) * 100,
pct_friends_diff_skin = mean(sd1_2 == 1, na.rm = TRUE) * 100,
pct_friends_roma = mean(sd1_3 == 1, na.rm = TRUE) * 100,
pct_friends_lgbt = mean(sd1_4 == 1, na.rm = TRUE) * 100,
pct_friends_disabled = mean(sd1_5 == 1, na.rm = TRUE) * 100,
pct_friends_diff_religion = mean(sd1_6 == 1, na.rm = TRUE) * 100,
pct_friends_transgender = mean(sd1_7 == 1, na.rm = TRUE) * 100,
pct_friends_intersex = mean(sd1_8 == 1, na.rm = TRUE) * 100,
pct_ethnic_minority = mean(sd2_1 == 1, na.rm = TRUE) * 100,
pct_skin_minority = mean(sd2_2 == 1, na.rm = TRUE) * 100,
pct_religious_minority = mean(sd2_3 == 1, na.rm = TRUE) * 100,
pct_roma = mean(sd2_4 == 1, na.rm = TRUE) * 100,
pct_sexual_minority = mean(sd2_5 == 1, na.rm = TRUE) * 100,
pct_disability = mean(sd2_6 == 1, na.rm = TRUE) * 100,
pct_other_minority = mean(sd2_7 == 1, na.rm = TRUE) * 100,
pct_any_minority = mean(sd2_1 == 1 | sd2_2 == 1 | sd2_3 == 1 | sd2_4 == 1 |
sd2_5 == 1 | sd2_6 == 1 | sd2_7 == 1, na.rm = TRUE) * 100,
# religion
pct_catholic = mean(sd3 == 1, na.rm = TRUE) * 100,
pct_orthodox = mean(sd3 == 2, na.rm = TRUE) * 100,
pct_protestant = mean(sd3 == 3, na.rm = TRUE) * 100,
pct_other_christian = mean(sd3 == 4, na.rm = TRUE) * 100,
pct_jewish = mean(sd3 == 5, na.rm = TRUE) * 100,
pct_muslim = mean(sd3 %in% c(6, 7, 8), na.rm = TRUE) * 100,
pct_atheist = mean(sd3 == 13, na.rm = TRUE) * 100,
pct_nonbeliever = mean(sd3 == 14, na.rm = TRUE) * 100,
pct_nonreligious = mean(sd3 %in% c(13, 14), na.rm = TRUE) * 100,
# subjective discrimination perception (recoded)
mean_discrim_ethnic = mean(5 - qc1_1, na.rm = TRUE),
mean_discrim_skin = mean(5 - qc1_2, na.rm = TRUE),
mean_discrim_roma = mean(5 - qc1_3, na.rm = TRUE),
mean_discrim_lgbt = mean(5 - qc1_4, na.rm = TRUE),
mean_discrim_age = mean(5 - qc1_5, na.rm = TRUE),
mean_discrim_religion = mean(5 - qc1_6, na.rm = TRUE),
mean_discrim_disability = mean(5 - qc1_7, na.rm = TRUE),
mean_discrim_transgender = mean(5 - qc1_8, na.rm = TRUE),
mean_discrim_gender = mean(5 - qc1_9, na.rm = TRUE),
mean_discrim_intersex = mean(5 - qc1_10, na.rm = TRUE),
)
return(country_agg)
}
# apply the function
country_aggregates <- calculate_country_aggregates(df_rf_new)
# join
country_data_combined <- country_df_imputed %>%
left_join(country_aggregates, by = "iso2")
# while it's less relevant for a ML model such as random forest, for a multi-level
# regression model, it's important to standardize the values to allow for better
# coefficient interpretation; the goal is to have both z-scores and 0-1 normalised
# values for each of the original variables (except for variables like 'region')
country_data_combined <- country_data_combined %>%
# first handle all numeric variables that aren't already standardized
mutate(
# z-score standardization
across(where(is.numeric) &
!starts_with("z_") &
!starts_with("norm_") &
!matches("country|iso"),
list(z = ~scale(.)[,1]),
.names = "z_{.col}"),
# also create 0-1 normalized versions
across(where(is.numeric) &
!starts_with("z_") &
!starts_with("norm_") &
!matches("country|iso"),
list(norm = ~(.-min(., na.rm=TRUE))/(max(., na.rm=TRUE)-min(., na.rm=TRUE))),
.names = "norm_{.col}"))
# save the combined dataset
saveRDS(country_data_combined, file = "country_data_combined.rds")
write_csv(country_data_combined, "country_data_combined.csv")
# join with df_rf; the actual, imputed survey data
df_rf_enriched_new <- df_rf_new %>%
# ISO codes as key
left_join(country_data_combined, by = c("isocntry" = "iso2"))
# save final dataset
write_csv(df_rf_enriched_new, "df_rf_enriched_new.csv")
# load data
country_level_df <- readRDS("country_level_df.rds")
# plot 1: employment and unemployment Rates
p_1 <- ggplot(country_level_df, aes(x = reorder(country_name, Employment))) +
geom_bar(aes(y = Employment), stat = "identity", fill = "steelblue", alpha = 0.7) +
geom_point(aes(y = Unemployment), color = "darkred", size = 3) +
scale_y_continuous(
name = "Employment Rate (%)",
limits = c(0, 100),
sec.axis = sec_axis(~., name = "Unemployment Rate (%)")
) +
labs(title = "Employment and Unemployment Rates by Country (2018)",
subtitle = "Bars: Employment rate | Points: Unemployment rate",
x = NULL) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
axis.title.y.left = element_text(color = "steelblue"),
axis.title.y.right = element_text(color = "darkred"))
# plot 2: GDP per capita and GDP per capita growth rate
p_2 <- ggplot(country_level_df, aes(x = reorder(country_name, gdp_2018))) +
geom_bar(aes(y = gdp_2018), stat = "identity", fill = "steelblue", alpha = 0.7) +
geom_point(aes(y = gdp_growth * 500), color = "darkred", size = 3) + # Scale growth to fit on same axis
scale_y_continuous(
name = "GDP per Capita (USD)",
labels = comma,
sec.axis = sec_axis(~./500, name = "GDP Growth 2005-2018 (%)")
) +
labs(title = "GDP per Capita (2018) and Growth Rate",
subtitle = "Bars: GDP per capita | Points: Growth rate",
x = NULL) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
axis.title.y.left = element_text(color = "steelblue"),
axis.title.y.right = element_text(color = "darkred"))
# plot 3: relationship between employment and LGBT support
p_3 <- ggplot(country_level_df, aes(x = Employment, y = lgbt_support_percent)) +
geom_point(aes(color = Unemployment, size = rainbow_score_2019)) +
geom_text(aes(label = iso2), hjust = -0.3, vjust = 0, size = 3) +
scale_color_viridis_c(option = "B", name = "Unemployment\nRate (%)") +
scale_size_continuous(name = "Rainbow\nScore 2019") +
labs(title = "Relationship between Employment Rates and LGBT Support",
subtitle = "Color indicates unemployment rate, size shows level of LGBT legal protection",
x = "Employment Rate (%)",
y = "LGBT Support (%)") +
theme_minimal() +
theme(panel.grid.minor = element_blank())
# plot 4: relationship between GDP per capita to LGBT support
p_4 <- ggplot(country_level_df, aes(x = gdp_2018, y = lgbt_support_percent)) +
geom_point(aes(color = rainbow_score_2019, size = gdp_growth)) +
geom_text(aes(label = iso2), hjust = -0.3, vjust = 0, size = 3) +
scale_color_viridis_c(option = "E", name = "Rainbow\nScore 2019") +
scale_size_continuous(name = "GDP Growth\n2005-2018 (%)") +
scale_x_continuous(labels = comma) +
labs(title = "Relationship between GDP, LGBT Support and Rights Protections",
subtitle = "Size indicates GDP growth rate from 2005-2018",
x = "GDP per Capita (2018, USD)",
y = "LGBT Support (%)") +
theme_minimal() +
theme(panel.grid.minor = element_blank())
# display the plots
p_1
p_2
p_3
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_text()`).
p_4
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 4 rows containing missing values or values outside the scale range
## (`geom_text()`).
We will use this reduced dataset when building our explanatory model. We chose these variables based on previous literature on transgender rights and the results of our descriptive analysis.
#load data
df_reduced <- df_rf_enriched_new %>%
select(
# target
qc19,
# country identifier
country, country_name, isocntry,
# individual demographics
age = d11,
gender = d10,
education = d8, d8r2,
occupation = d15a,
urban_rural = d25,
financial_insecurity = d60,
social_class = d63,
religion = sd3,
political_ideology = d1,
lgb_friends = sd1_4,
trans_friends = sd1_7,
# Personal identity
ethnic_minority = sd2_1,
skin_color_minority = sd2_2,
religious_minority = sd2_3,
sexual_lgbt_minority = sd2_5,
disability_minority = sd2_6,
other_minority = sd2_7,
none_minority = sd2_8,
# Discrimination
trans_discrimination_country = qc1_8,
trans_discrimination_personal = qc2_6,
trans_discrimination_workplace = qc4_8,
trans_discrimination_political = qc6_10,
country_discrimination_efforts = qc7,
country_discrimination_efforts_recoded = qc7r,
trans_workplace_diversity = qc9_10,
trans_colleague = qc12_11,
trans_colleague_recoded = qc12_11r,
trans_child_relationship = qc13_11,
trans_child_relationship_recoded = qc13_11r,
lgb_rights = qc15_1,
same_sex_relationship = qc15_2,
same_sex_marriage = qc15_3,
lgb_school_materials = qc17_3,
trans_school_materials = qc17_4,
intersex_school_materials = qc17_5,
two_men_public_affection = qc18_2,
two_men_public_affection_recoded = qc18_2r,
two_women_public_affection = qc18_3,
two_women_public_affection_recoded = qc18_3r,
non_gendered_docs = qc20,
# Country-level variables
gdp_2018, # Economic development
rainbow_score_2019, # LGBTI rights/protections
gender_equality_index, # Gender equality
Happiness_Score, # National well-being
gdp_2018, # Economic development
rainbow_score_2019, # LGBTI rights/protections
rainbow_score_2018, # LGBTI rights/protections (previous year)
rainbow_score_avg_2019_2018, # Average LGBTI rights/protections
gender_equality_index, # Gender equality
Happiness_Score, # National well-being
v2x_libdem, # Democratic quality
v2x_egaldem, # Democratic quality
Regime_type, # Democratic quality
z_functioning_of_government, #Govenment function measure
composite_equality, # Equality measure
z_composite_equality, # Standardized equality measure
norm_composite_equality, # Normalized equality measure
z_lgbt_support, # Standardized LGBT support
norm_lgbt_support, # Normalized LGBT support
pct_friends_lgbt, # Percentage of LGBT friends
z_pct_friends_lgbt, # Standardized % of LGBT friends
norm_pct_friends_lgbt, # Normalized % of LGBT friends
mean_left_right, # Average political leaning
pct_high_education, # Education level
region, # Geographic region
# Paradata
interview_date = p1,
interview_start_time = p2,
interview_duration = p3,
interview_duration_recoded = p3r,
people_present_during_interview = p4,
respondent_cooperation = p5
)
# Save to a new file
write_rds(df_reduced, "df_reduced.rds")
# load datasets
df_reduced <- readRDS("df_reduced.rds")
country_data_combined <- readRDS("country_data_combined.rds")
# first calculate country aggregates for individual variables so that we have
# continuous data for which we can more easily calculate correlations
df_reduced_aggregated <- df_reduced %>%
group_by(country_name, isocntry) %>%
summarize(
# target
pct_support_trans = mean(qc19 == 1, na.rm = TRUE) * 100,
pct_oppose_trans = mean(qc19 == 2, na.rm = TRUE) * 100,
pct_dk_trans = mean(qc19 == 3, na.rm = TRUE) * 100,
# demographics
mean_age = mean(age, na.rm = TRUE),
pct_female = mean(gender == 2, na.rm = TRUE) * 100,
mean_education_years = mean(education, na.rm = TRUE),
pct_high_educ = mean(d8r2 >= 3, na.rm = TRUE) * 100,
pct_rural = mean(urban_rural == 1, na.rm = TRUE) * 100,
pct_urban = mean(urban_rural == 2, na.rm = TRUE) * 100,
pct_large_urban = mean(urban_rural == 3, na.rm = TRUE) * 100,
pct_financial_difficulty = mean(financial_insecurity <= 2, na.rm = TRUE) * 100,
pct_working_class = mean(social_class == 1, na.rm = TRUE) * 100,
pct_middle_class = mean(social_class %in% c(2,3,4), na.rm = TRUE) * 100,
pct_upper_class = mean(social_class == 5, na.rm = TRUE) * 100,
# values and identity
mean_political_ideology = mean(political_ideology, na.rm = TRUE),
pct_left = mean(political_ideology <= 3, na.rm = TRUE) * 100,
pct_center = mean(political_ideology > 3 & political_ideology < 8, na.rm = TRUE) * 100,
pct_right = mean(political_ideology >= 8, na.rm = TRUE) * 100,
# religious composition
pct_religious = mean(religion %in% c(1:11), na.rm = TRUE) * 100,
pct_catholic = mean(religion == 1, na.rm = TRUE) * 100,
pct_orthodox = mean(religion == 2, na.rm = TRUE) * 100,
pct_protestant = mean(religion == 3, na.rm = TRUE) * 100,
pct_other_christian = mean(religion == 4, na.rm = TRUE) * 100,
pct_muslim = mean(religion %in% c(6:8), na.rm = TRUE) * 100,
pct_other_religion = mean(religion %in% c(9:11), na.rm = TRUE) * 100,
pct_atheist = mean(religion == 13, na.rm = TRUE) * 100,
pct_nonbeliever = mean(religion == 14, na.rm = TRUE) * 100,
pct_nonreligious = mean(religion %in% c(13,14), na.rm = TRUE) * 100,
# social networks and contact
pct_lgb_friends = mean(lgb_friends == 1, na.rm = TRUE) * 100,
pct_trans_friends = mean(trans_friends == 1, na.rm = TRUE) * 100,
# personal identity/minority status
pct_ethnic_minority = mean(ethnic_minority == 1, na.rm = TRUE) * 100,
pct_skin_color_minority = mean(skin_color_minority == 1, na.rm = TRUE) * 100,
pct_religious_minority = mean(religious_minority == 1, na.rm = TRUE) * 100,
pct_sexual_minority = mean(sexual_lgbt_minority == 1, na.rm = TRUE) * 100,
pct_disability_minority = mean(disability_minority == 1, na.rm = TRUE) * 100,
pct_other_minority = mean(other_minority == 1, na.rm = TRUE) * 100,
pct_no_minority = mean(none_minority == 1, na.rm = TRUE) * 100,
# discrimination perceptions; we need to be careful with interpretation/ coding
mean_trans_discrim_country = mean(trans_discrimination_country, na.rm = TRUE),
pct_perceive_trans_discrim = mean(trans_discrimination_country %in% c(1,2), na.rm = TRUE) * 100,
pct_experienced_trans_discrim = mean(trans_discrimination_personal == 1, na.rm = TRUE) * 100,
pct_workplace_trans_discrim = mean(trans_discrimination_workplace == 1, na.rm = TRUE) * 100,
# political representation comfort - original scale (1-10, higher = more comfortable)
mean_trans_political_comfort = mean(trans_discrimination_political, na.rm = TRUE),
pct_comfortable_trans_political = mean(trans_discrimination_political >= 7, na.rm = TRUE) * 100,
# effectiveness of discrimination efforts (original scale: 1-10, higher = more effective)
mean_country_discrimination_efforts = mean(country_discrimination_efforts, na.rm = TRUE),
# workplace diversity (1=Yes definitely, 4=No definitely not)
mean_trans_workplace_diversity = mean(trans_workplace_diversity, na.rm = TRUE),
pct_support_trans_workplace_diversity = mean(trans_workplace_diversity %in% c(1,2), na.rm = TRUE) * 100,
# LGBT attitudes - REVERSED for these scales where 1=Totally agree, 4=Totally disagree
# Create reversed versions so higher = more supportive
mean_lgb_rights_rev = mean(case_when(
lgb_rights == 1 ~ 4,
lgb_rights == 2 ~ 3,
lgb_rights == 3 ~ 2,
lgb_rights == 4 ~ 1,
TRUE ~ NA_real_), na.rm = TRUE),
mean_same_sex_relationship_rev = mean(case_when(
same_sex_relationship == 1 ~ 4,
same_sex_relationship == 2 ~ 3,
same_sex_relationship == 3 ~ 2,
same_sex_relationship == 4 ~ 1,
TRUE ~ NA_real_), na.rm = TRUE),
mean_same_sex_marriage_rev = mean(case_when(
same_sex_marriage == 1 ~ 4,
same_sex_marriage == 2 ~ 3,
same_sex_marriage == 3 ~ 2,
same_sex_marriage == 4 ~ 1,
TRUE ~ NA_real_), na.rm = TRUE),
# also calculate percentage who agree with LGBT rights (original values 1 or 2)
pct_support_lgb_rights = mean(lgb_rights %in% c(1,2), na.rm = TRUE) * 100,
pct_support_same_sex_relationship = mean(same_sex_relationship %in% c(1,2), na.rm = TRUE) * 100,
pct_support_same_sex_marriage = mean(same_sex_marriage %in% c(1,2), na.rm = TRUE) * 100,
# school materials (1=Totally agree, 4=Totally disagree)
# Create reversed versions so higher = more supportive
mean_lgb_school_materials_rev = mean(case_when(
lgb_school_materials == 1 ~ 4,
lgb_school_materials == 2 ~ 3,
lgb_school_materials == 3 ~ 2,
lgb_school_materials == 4 ~ 1,
TRUE ~ NA_real_), na.rm = TRUE),
mean_trans_school_materials_rev = mean(case_when(
trans_school_materials == 1 ~ 4,
trans_school_materials == 2 ~ 3,
trans_school_materials == 3 ~ 2,
trans_school_materials == 4 ~ 1,
TRUE ~ NA_real_), na.rm = TRUE),
mean_intersex_school_materials_rev = mean(case_when(
intersex_school_materials == 1 ~ 4,
intersex_school_materials == 2 ~ 3,
intersex_school_materials == 3 ~ 2,
intersex_school_materials == 4 ~ 1,
TRUE ~ NA_real_), na.rm = TRUE),
# also calculate percentage who agree with inclusive school materials
pct_support_lgb_school_materials = mean(lgb_school_materials %in% c(1,2), na.rm = TRUE) * 100,
pct_support_trans_school_materials = mean(trans_school_materials %in% c(1,2), na.rm = TRUE) * 100,
pct_support_intersex_school_materials = mean(intersex_school_materials %in% c(1,2), na.rm = TRUE) * 100,
# comfort with same-sex public displays of affection (1-10)
mean_men_pda_comfort = mean(two_men_public_affection, na.rm = TRUE),
mean_women_pda_comfort = mean(two_women_public_affection, na.rm = TRUE),
pct_comfortable_men_pda = mean(two_men_public_affection >= 7, na.rm = TRUE) * 100,
pct_comfortable_women_pda = mean(two_women_public_affection >= 7, na.rm = TRUE) * 100,
# comfort with transgender colleagues/children (1-10)
mean_trans_colleague_comfort = mean(trans_colleague, na.rm = TRUE),
mean_trans_child_relationship = mean(trans_child_relationship, na.rm = TRUE),
pct_comfortable_trans_colleague = mean(trans_colleague >= 7, na.rm = TRUE) * 100,
pct_comfortable_trans_child = mean(trans_child_relationship >= 7, na.rm = TRUE) * 100,
# support for non-gendered documents
pct_support_nongendered_docs = mean(non_gendered_docs == 1, na.rm = TRUE) * 100,
pct_oppose_nongendered_docs = mean(non_gendered_docs == 2, na.rm = TRUE) * 100,
pct_dk_nongendered_docs = mean(non_gendered_docs == 3, na.rm = TRUE) * 100) %>%
ungroup()
## `summarise()` has grouped output by 'country_name'. You can override using the
## `.groups` argument.
# add country-level variables from the original dataset
# extract country-level variables that don't need to be aggregated
country_level_variables <- df_reduced %>%
select(country_name, gdp_2018, rainbow_score_2019, rainbow_score_2018,
gender_equality_index, Happiness_Score,
v2x_libdem, v2x_egaldem, Regime_type, composite_equality,
z_composite_equality, norm_composite_equality, z_lgbt_support,
norm_lgbt_support, pct_friends_lgbt, z_pct_friends_lgbt,
norm_pct_friends_lgbt, mean_left_right, pct_high_education, region) %>%
distinct()
# merge with aggregated data
df_reduced_aggregated_complete <- df_reduced_aggregated %>%
left_join(country_level_variables, by = "country_name")
country_data_combined_add <- country_data_combined %>%
select(iso2, norm_gdp_growth, norm_Unemployment, pct_trade_support, norm_gdp_2018,
z_gender_equality, z_rainbow_score, z_v2x_libdem, z_v2x_egaldem, z_happiness,
norm_gdp_growth, z_v2x_gender)
df_reduced_aggregated_complete <- df_reduced_aggregated_complete %>%
left_join(country_data_combined_add, by = c("isocntry" = "iso2"))
# Check the final dataset
dim(df_reduced_aggregated_complete)
## [1] 28 100
# save it
write_rds(df_reduced_aggregated_complete, "df_reduced_aggregated_complete")
### create different plots
# calculate correlation matrix for numeric variables
cor_matrix <- cor(select_if(df_reduced_aggregated_complete, is.numeric),
use = "pairwise.complete.obs")
## Warning in cor(select_if(df_reduced_aggregated_complete, is.numeric), use =
## "pairwise.complete.obs"): the standard deviation is zero
# (1) visualize complete correlation matrix
corrplot(cor_matrix, method = "color", type = "upper",
diag = F,
tl.col = "black", tl.srt = 45,
number.cex = 0.7, tl.cex = 0.7)
# (2) create custom function to analyze correlations with outcome
analyze_correlations_with_outcome <- function(data, outcome_var, min_correlation = 0.3) {
# numeric columns only
numeric_data <- data %>% select(where(is.numeric))
# calculate correlations with outcome
cors <- cor(numeric_data, numeric_data[[outcome_var]], use = "pairwise.complete.obs")
# convert to data frame
cor_df <- data.frame(
variable = rownames(cors),
correlation = cors[,1]) %>%
# remove the outcome variable correlating with itself
filter(variable != outcome_var) %>%
# sort by absolute correlation
arrange(desc(abs(correlation)))
# filter by minimum correlation threshold
cor_df <- cor_df %>% filter(abs(correlation) >= min_correlation)
return(cor_df)
}
# run correlation analysis for transgender support
trans_support_correlations <- analyze_correlations_with_outcome(
df_reduced_aggregated_complete, "pct_support_trans", min_correlation = 0.25)
## Warning in cor(numeric_data, numeric_data[[outcome_var]], use =
## "pairwise.complete.obs"): the standard deviation is zero
# display results
print(trans_support_correlations)
## variable
## pct_oppose_trans pct_oppose_trans
## pct_support_nongendered_docs pct_support_nongendered_docs
## pct_oppose_nongendered_docs pct_oppose_nongendered_docs
## pct_support_lgb_school_materials pct_support_lgb_school_materials
## z_lgbt_support z_lgbt_support
## norm_lgbt_support norm_lgbt_support
## pct_support_lgb_rights pct_support_lgb_rights
## mean_lgb_rights_rev mean_lgb_rights_rev
## pct_support_intersex_school_materials pct_support_intersex_school_materials
## pct_support_trans_school_materials pct_support_trans_school_materials
## pct_support_same_sex_marriage pct_support_same_sex_marriage
## pct_support_same_sex_relationship pct_support_same_sex_relationship
## pct_comfortable_trans_colleague pct_comfortable_trans_colleague
## mean_trans_colleague_comfort mean_trans_colleague_comfort
## mean_lgb_school_materials_rev mean_lgb_school_materials_rev
## mean_same_sex_marriage_rev mean_same_sex_marriage_rev
## mean_same_sex_relationship_rev mean_same_sex_relationship_rev
## mean_intersex_school_materials_rev mean_intersex_school_materials_rev
## mean_trans_school_materials_rev mean_trans_school_materials_rev
## mean_men_pda_comfort mean_men_pda_comfort
## norm_pct_friends_lgbt norm_pct_friends_lgbt
## pct_lgb_friends pct_lgb_friends
## pct_friends_lgbt pct_friends_lgbt
## z_pct_friends_lgbt z_pct_friends_lgbt
## pct_comfortable_men_pda pct_comfortable_men_pda
## mean_women_pda_comfort mean_women_pda_comfort
## pct_comfortable_trans_political pct_comfortable_trans_political
## pct_comfortable_women_pda pct_comfortable_women_pda
## mean_trans_political_comfort mean_trans_political_comfort
## mean_trans_child_relationship mean_trans_child_relationship
## pct_comfortable_trans_child pct_comfortable_trans_child
## z_v2x_egaldem z_v2x_egaldem
## v2x_egaldem v2x_egaldem
## composite_equality composite_equality
## z_composite_equality z_composite_equality
## norm_composite_equality norm_composite_equality
## rainbow_score_2018 rainbow_score_2018
## z_v2x_libdem z_v2x_libdem
## v2x_libdem v2x_libdem
## rainbow_score_2019 rainbow_score_2019
## z_rainbow_score z_rainbow_score
## z_gender_equality z_gender_equality
## gender_equality_index gender_equality_index
## pct_right pct_right
## pct_trans_friends pct_trans_friends
## mean_political_ideology mean_political_ideology
## mean_left_right mean_left_right
## Happiness_Score Happiness_Score
## z_happiness z_happiness
## norm_gdp_2018 norm_gdp_2018
## gdp_2018 gdp_2018
## pct_center pct_center
## norm_gdp_growth norm_gdp_growth
## mean_trans_discrim_country mean_trans_discrim_country
## pct_perceive_trans_discrim pct_perceive_trans_discrim
## pct_high_education pct_high_education
## pct_high_educ pct_high_educ
## pct_workplace_trans_discrim pct_workplace_trans_discrim
## pct_financial_difficulty pct_financial_difficulty
## pct_female pct_female
## pct_left pct_left
## mean_age mean_age
## pct_atheist pct_atheist
## pct_trade_support pct_trade_support
## z_v2x_gender z_v2x_gender
## pct_orthodox pct_orthodox
## pct_religious pct_religious
## pct_nonreligious pct_nonreligious
## pct_experienced_trans_discrim pct_experienced_trans_discrim
## pct_urban pct_urban
## pct_support_trans_workplace_diversity pct_support_trans_workplace_diversity
## pct_no_minority pct_no_minority
## pct_large_urban pct_large_urban
## pct_protestant pct_protestant
## pct_other_religion pct_other_religion
## pct_ethnic_minority pct_ethnic_minority
## mean_country_discrimination_efforts mean_country_discrimination_efforts
## correlation
## pct_oppose_trans -1.0000000
## pct_support_nongendered_docs 0.9552632
## pct_oppose_nongendered_docs -0.9552632
## pct_support_lgb_school_materials 0.9099915
## z_lgbt_support 0.9055394
## norm_lgbt_support 0.9055394
## pct_support_lgb_rights 0.9054333
## mean_lgb_rights_rev 0.8873981
## pct_support_intersex_school_materials 0.8853638
## pct_support_trans_school_materials 0.8828781
## pct_support_same_sex_marriage 0.8820719
## pct_support_same_sex_relationship 0.8799193
## pct_comfortable_trans_colleague 0.8795309
## mean_trans_colleague_comfort 0.8697944
## mean_lgb_school_materials_rev 0.8679107
## mean_same_sex_marriage_rev 0.8669420
## mean_same_sex_relationship_rev 0.8656658
## mean_intersex_school_materials_rev 0.8479568
## mean_trans_school_materials_rev 0.8459165
## mean_men_pda_comfort 0.8398369
## norm_pct_friends_lgbt 0.8350491
## pct_lgb_friends 0.8350491
## pct_friends_lgbt 0.8350491
## z_pct_friends_lgbt 0.8350491
## pct_comfortable_men_pda 0.8313883
## mean_women_pda_comfort 0.8219693
## pct_comfortable_trans_political 0.8144761
## pct_comfortable_women_pda 0.8135901
## mean_trans_political_comfort 0.8080428
## mean_trans_child_relationship 0.8008226
## pct_comfortable_trans_child 0.7938314
## z_v2x_egaldem 0.7909474
## v2x_egaldem 0.7909474
## composite_equality 0.7752634
## z_composite_equality 0.7752634
## norm_composite_equality 0.7752634
## rainbow_score_2018 0.7539898
## z_v2x_libdem 0.7147897
## v2x_libdem 0.7147897
## rainbow_score_2019 0.7094428
## z_rainbow_score 0.7094428
## z_gender_equality 0.7035475
## gender_equality_index 0.7035475
## pct_right -0.6890892
## pct_trans_friends 0.6648839
## mean_political_ideology -0.6422553
## mean_left_right -0.6422553
## Happiness_Score 0.6310032
## z_happiness 0.6310032
## norm_gdp_2018 0.5771817
## gdp_2018 0.5771817
## pct_center 0.5523491
## norm_gdp_growth -0.5522688
## mean_trans_discrim_country -0.5172765
## pct_perceive_trans_discrim 0.5008079
## pct_high_education 0.4980007
## pct_high_educ 0.4948097
## pct_workplace_trans_discrim 0.4664769
## pct_financial_difficulty -0.4504279
## pct_female -0.4496147
## pct_left 0.4253939
## mean_age 0.4017389
## pct_atheist 0.4001147
## pct_trade_support 0.3991507
## z_v2x_gender 0.3977959
## pct_orthodox -0.3976753
## pct_religious -0.3825894
## pct_nonreligious 0.3724599
## pct_experienced_trans_discrim -0.3707182
## pct_urban 0.3530711
## pct_support_trans_workplace_diversity 0.3494914
## pct_no_minority 0.3146637
## pct_large_urban -0.3090152
## pct_protestant 0.3082064
## pct_other_religion 0.2785021
## pct_ethnic_minority -0.2752098
## mean_country_discrimination_efforts 0.2604523
# create a visualization of top correlations
ggplot(trans_support_correlations %>% head(15),
aes(x = reorder(variable, correlation), y = correlation)) +
geom_bar(stat = "identity", aes(fill = correlation > 0)) +
coord_flip() +
labs(title = "Top correlated variables of transgender rights support",
subtitle = "Country-level aggregated data",
x = "Variables",
y = "Correlation with % supporting transgender document changes") +
theme_minimal() +
scale_fill_manual(values = c("firebrick", "steelblue"),
name = "Direction",
labels = c("Negative", "Positive"))
# (3) create correlation matrix for key variables
key_vars <- c("pct_support_trans",
trans_support_correlations$variable[1:25], # top 25 correlates
"gdp_2018", "rainbow_score_2019", "gender_equality_index")
key_cor_matrix <- cor(df_reduced_aggregated_complete[, key_vars],
use = "pairwise.complete.obs")
corrplot(key_cor_matrix, method = "color",
order = "hclust", diag = F,
tl.col = "black", tl.srt = 45,
number.cex = 0.6, tl.cex = 0.7)
# (4) create correlation matrix for all country variables
key_vars_country <- c("pct_support_trans", "norm_gdp_2018", "rainbow_score_2019","z_gender_equality", "norm_Unemployment", "norm_gdp_growth", "z_v2x_libdem", "z_v2x_egaldem","z_happiness", "z_v2x_gender")
key_cor_matrix_country <- cor(df_reduced_aggregated_complete[, key_vars_country],
use = "pairwise.complete.obs")
corrplot(key_cor_matrix_country, method = "color",
order = "hclust", diag = F, addCoef.col = "black",
tl.col = "black", tl.srt = 45,
number.cex = 0.6, tl.cex = 0.7)
###
# regional comparisons
region_summary <- df_reduced_aggregated_complete %>%
group_by(region) %>%
summarize(
n_countries = n(),
avg_trans_support = mean(pct_support_trans),
min_support = min(pct_support_trans),
max_support = max(pct_support_trans),
std_dev = sd(pct_support_trans)) %>%
arrange(desc(avg_trans_support))
# visualize regional differences
ggplot(df_reduced_aggregated_complete, aes(x = reorder(region, pct_support_trans, FUN = mean),
y = pct_support_trans)) +
geom_boxplot(aes(fill = region)) +
geom_jitter(width = 0.2, alpha = 0.5) +
coord_flip() +
labs(title = "Support for Transgender Rights by European Region",
x = "Region",
y = "% Supporting Transgender Rights") +
theme_minimal() +
theme(legend.position = "none")
### some regression
# create a scatterplot matrix with regression lines
ggplot(df_reduced_aggregated_complete,
aes(x = pct_nonreligious, y = pct_support_trans)) +
geom_point(aes(size = rainbow_score_2019, color = region)) +
geom_text_repel(aes(label = country_name), size = 3, max.overlaps = 10) +
geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
labs(title = "Relationship Between Non-religiosity and Transgender Rights Support",
x = "% Non-religious Population",
y = "% Supporting Transgender Rights",
size = "Rainbow Score",
color = "Region") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
ggplot(df_reduced_aggregated_complete,
aes(x = gender_equality_index, y = pct_support_trans)) +
geom_point(aes(size = rainbow_score_2019, color = region)) +
geom_text_repel(aes(label = country_name), size = 3, max.overlaps = 10) +
geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
labs(title = "Gender Equality and Transgender Rights Support",
x = "Gender Equality Index",
y = "% Supporting Transgender Rights",
size = "Rainbow Score",
color = "Region") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
### clusters
cluster_vars <- c("pct_support_trans", "norm_gdp_2018",
"rainbow_score_2019", "gender_equality_index",
"v2x_libdem", "pct_catholic", "pct_nonreligious")
# Prepare data
cluster_data <- df_reduced_aggregated_complete %>%
select(country_name, all_of(cluster_vars)) %>%
na.omit() %>%
column_to_rownames("country_name")
# Scale the data
cluster_data_scaled <- scale(cluster_data)
# Compute distance matrix
dist_matrix <- dist(cluster_data_scaled, method = "euclidean")
# Hierarchical clustering
hc <- hclust(dist_matrix, method = "ward.D2")
# Plot dendrogram
plot(hc, main = "Clustering of Countries by Support for Transgender Rights",
sub = "", xlab = "", cex = 0.7)
rect.hclust(hc, k = 4, border = "red")
### summary table
country_summary <- df_reduced_aggregated_complete %>%
select(country_name, pct_support_trans, rainbow_score_2019,
gender_equality_index, norm_gdp_2018,
pct_nonreligious, v2x_libdem, gdp_2018) %>%
arrange(desc(pct_support_trans))
kable(country_summary,
col.names = c("Country", "Trans Rights Support (%)", "Rainbow Score",
"Gender Equality", "LGBT Comfort",
"Non-religious (%)", "Liberal Democracy", "GDP 2018"),
digits = 1) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
row_spec(0, bold = TRUE) %>%
add_header_above(c(" " = 1, "Support Measures" = 2, "Equality Measures" = 2,
"Social Factors" = 2, "Economic" = 1))
| Country | Trans Rights Support (%) | Rainbow Score | Gender Equality | LGBT Comfort | Non-religious (%) | Liberal Democracy | GDP 2018 |
|---|---|---|---|---|---|---|---|
| Spain | 89.1 | 58.5 | 68.3 | 0.2 | 21.7 | 0.8 | 41073.7 |
| Netherlands | 86.8 | 49.7 | 72.9 | 0.4 | 47.6 | 0.8 | 58818.0 |
| Malta | 82.2 | 74.7 | 60.1 | 0.3 | 4.8 | 0.6 | 48127.7 |
| Denmark | 79.5 | 60.3 | 76.8 | 0.4 | 14.2 | 0.9 | 57231.3 |
| France | 79.0 | 62.7 | 72.6 | 0.2 | 22.0 | 0.8 | 46397.5 |
| Luxembourg | 79.0 | 41.3 | 69.0 | 1.0 | 20.8 | 0.8 | 116334.0 |
| Germany | 78.9 | 47.5 | 65.5 | 0.3 | 28.1 | 0.8 | 56272.9 |
| Portugal | 75.4 | 57.5 | 56.0 | 0.1 | 9.7 | 0.8 | 34725.2 |
| Ireland | 74.1 | 44.7 | 69.5 | 0.7 | 10.0 | 0.8 | 86433.7 |
| Sweden | 73.1 | 52.9 | 82.6 | 0.3 | 37.2 | 0.9 | 53122.5 |
| Belgium | 73.0 | 70.4 | 70.5 | 0.3 | 24.9 | 0.8 | 52466.5 |
| Finland | 71.9 | 62.2 | 73.0 | 0.3 | 17.0 | 0.8 | 49242.7 |
| United Kingdom | 70.8 | 67.6 | 69.6 | 0.2 | 29.4 | 0.8 | 47108.0 |
| Austria | 61.8 | 48.5 | 63.3 | 0.4 | 21.5 | 0.8 | 56654.5 |
| Greece | 59.9 | 44.8 | 50.0 | 0.1 | 1.8 | 0.8 | 29792.0 |
| Estonia | 59.1 | 34.4 | 56.7 | 0.1 | 36.4 | 0.8 | 37201.1 |
| Cyprus | 53.5 | 22.0 | 55.1 | 0.2 | 2.0 | 0.7 | 40922.7 |
| Slovenia | 52.8 | 39.8 | 68.4 | 0.2 | 6.8 | 0.7 | 38656.2 |
| Italy | 49.9 | 19.4 | 62.1 | 0.2 | 12.2 | 0.8 | 43583.4 |
| Poland | 47.2 | 18.1 | 56.8 | 0.1 | 4.6 | 0.5 | 32360.9 |
| Latvia | 46.5 | 16.8 | 57.9 | 0.1 | 20.1 | 0.7 | 29832.1 |
| Croatia | 44.8 | 45.8 | 53.1 | 0.1 | 7.3 | 0.6 | 29043.9 |
| Czech Republic | 43.8 | 25.5 | 53.6 | 0.2 | 48.2 | 0.7 | 42016.4 |
| Lithuania | 41.7 | 19.9 | 56.8 | 0.1 | 7.0 | 0.8 | 36491.9 |
| Slovakia | 28.2 | 29.0 | 52.4 | 0.1 | 8.5 | 0.7 | 31514.2 |
| Romania | 23.6 | 20.7 | 52.4 | 0.1 | 1.6 | 0.6 | 29569.6 |
| Bulgaria | 16.7 | 25.8 | 58.0 | 0.0 | 4.3 | 0.5 | 24052.2 |
| Hungary | 16.2 | 42.8 | 50.8 | 0.1 | 17.4 | 0.4 | 32258.3 |
### check some bivariate relationships; choose three key demographic variblaes
# (1) age
ggplot(df_reduced, aes(x = age, y = as.numeric(qc19 == 1))) +
geom_jitter(alpha = 0.1, height = 0.05) +
geom_smooth(method = "loess", se = TRUE) +
labs(title = "Relationship Between Age and Transgender Rights Support",
x = "Age", y = "Probability of Support") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# looks like age has a non-linear effect as the curve is flat for ages until ~60 and then falls off
# this suggests we should use a quadratic term for age
# (2) political ideology
ggplot(df_reduced, aes(x = political_ideology, y = as.numeric(qc19 == 1))) +
geom_jitter(alpha = 0.1, height = 0.05) +
geom_smooth(method = "loess", se = TRUE) +
labs(title = "Relationship Between Political Ideology and Transgender Rights Support",
x = "Political Ideology (Left to Right)",
y = "Probability of Support") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# (3a) religion
ggplot(df_reduced_aggregated_complete, aes(x = pct_nonreligious, y = pct_support_trans)) +
geom_point(aes(size = rainbow_score_2019, color = region), alpha = 0.8) +
geom_text_repel(aes(label = country_name), size = 3, max.overlaps = 15) +
geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
labs(title = "Relationship Between Non-religiosity and Transgender Rights Support",
subtitle = "Country-level data from Special Eurobarometer 493 (2019)",
x = "% Non-religious Population",
y = "% Supporting Transgender Document Changes",
size = "Rainbow Score",
color = "Region") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# (3b) religion disaggregated
religion_support <- df_reduced %>%
mutate(religion_group = case_when(
religion == 1 ~ "Catholic",
religion == 2 ~ "Orthodox",
religion == 3 ~ "Protestant",
religion == 4 ~ "Other Christian",
religion %in% c(6:8) ~ "Muslim",
religion %in% c(9:11) ~ "Other Religion",
religion == 13 ~ "Atheist",
religion == 14 ~ "Non-believer",
TRUE ~ "Other/Missing")) %>%
group_by(religion_group) %>%
summarize(
support_rate = mean(qc19 == 1, na.rm = TRUE) * 100,
sample_size = n(),
se = sqrt((support_rate/100 * (1-support_rate/100)) / sample_size) * 100) %>%
filter(religion_group != "Other/Missing", sample_size >= 30) %>%
arrange(desc(support_rate))
ggplot(religion_support, aes(x = reorder(religion_group, support_rate), y = support_rate)) +
geom_bar(stat = "identity", fill = "steelblue", width = 0.7) +
geom_errorbar(aes(ymin = support_rate - 1.96*se,
ymax = support_rate + 1.96*se),
width = 0.2) +
geom_text(aes(label = sprintf("%.1f%%", support_rate), y = support_rate + 3),
vjust = 0) +
geom_text(aes(label = paste0("n=", sample_size), y = -5),
vjust = 1, size = 3) +
coord_flip() +
labs(title = "Support for Transgender Rights by Religious Affiliation",
subtitle = "Percentage supporting transgender document changes by religion",
x = NULL,
y = "Support Rate (%)") +
ylim(-5, max(religion_support$support_rate) + 10) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold"),
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
axis.text.y = element_text(face = "bold"))
Based on the results of our descriptive analysis, we should use the following variables in the multilevel regression model: z_rainbow_score, z_gender_equality, regions, z_v2x_libdem, z_v2z_egaldem, norm_gdp_2018
### use descriptive ML for predictor selection
# prepare data for random forest by selecting variables we considered before
df_reduced$region <- as.factor(df_reduced$region)
df_reduced$Regime_type <- as.factor(df_reduced$Regime_type)
df_reduced$qc19 <- ifelse(df_reduced$qc19 == 1, 1, ifelse(df_reduced$qc19 == 2, 0, NA)) %>%
as.factor() # convert target to binary
# remove NAs due to the coding before
df_reduced <- df_reduced %>%
na.omit()
# join
df_reduced_rf <- df_reduced %>%
left_join(country_data_combined, by = "country_name")
model_data <- df_reduced_rf %>%
select(region.x, z_v2x_libdem, z_v2x_egaldem, z_v2x_freexp, z_v2x_gender, z_v2x_liberal,
z_gender_equality_index, z_rainbow_score_2019, norm_gdp_2018, norm_Unemployment,
norm_Happiness_Score, norm_gdp_growth, z_Functioning_of_government,
z_Electoral_process_and_pluralism, qc19)
# split data into training and testing
set.seed(69420)
train_index <- createDataPartition(model_data$qc19, p = 0.7, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]
# train Random Forest model
rf_model <- ranger(
formula = qc19 ~ .,
data = train_data,
importance = "impurity",
num.trees = 500,
mtry = floor(sqrt(ncol(train_data) - 1)),
probability = TRUE
)
# extract variable importance correctly from ranger
var_importance <- rf_model$variable.importance
# ensure it is a numeric named vector before converting to a data frame
var_importance_df <- data.frame(
Variable = names(var_importance),
Importance = as.numeric(var_importance)
)
# sort the data frame by importance in descending order
var_importance_df <- var_importance_df[order(var_importance_df$Importance, decreasing = TRUE), ]
# plot variable importance
ggplot(var_importance_df, aes(x = reorder(Variable, Importance), y = Importance)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_minimal() +
labs(
title = "Variable Importance from Random Forest",
x = "Variables",
y = "Importance"
)
We see similar variables are most important using descriptive ML, reinforcing our previous decision.
#Load data
data <- read_dta("data/raw/ZA7575.dta")
df_reduced <- readRDS("df_reduced.rds")
# restore the old NAs and do CCA
df_reduced$qc19 <- data$qc19
df_reduced$qc19 <- ifelse(df_reduced$qc19 == 3, NA, df_reduced$qc19)
df_reduced <- df_reduced %>%
drop_na(qc19)
# quick data check
table(df_reduced$country_name)
##
## Austria Belgium Bulgaria Croatia Cyprus
## 957 1014 738 912 399
## Czech Republic Denmark Estonia Finland France
## 919 930 803 883 915
## Germany Greece Hungary Ireland Italy
## 1333 923 903 882 889
## Latvia Lithuania Luxembourg Malta Netherlands
## 847 851 468 448 980
## Poland Portugal Romania Slovakia Slovenia
## 818 911 905 886 920
## Spain Sweden United Kingdom
## 921 902 901
# Calculate the percentage of "Yes" (1) responses for qc19 by country
country_means <- aggregate(qc19 == 1 ~ country_name, data = df_reduced, FUN = mean)
# Multiply by 100 to get percentage and then sort
country_means$percentage <- country_means$`qc19 == 1` * 100
country_means <- country_means[order(country_means$percentage), ]
# Create the plot
ggplot(country_means, aes(x = reorder(country_name, percentage), y = percentage)) +
geom_bar(stat = "identity", fill = "steelblue") +
theme_minimal() +
labs(title = "Support for Transgender Rights by Country",
subtitle = "Percentage answering 'Yes' to allowing transgender people to change civil documents",
x = "Country",
y = "Support (%)") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# scale some of the variables (both categorical and continuous) with larger scales
df_reduced <- df_reduced %>%
mutate(
age_z = scale(age),
religion_z = scale(as.numeric(religion)),
political_ideology_z = scale(political_ideology),
respondent_cooperation_z = scale(as.numeric(respondent_cooperation)),
rainbow_score_z = scale(rainbow_score_2019),
v2x_egaldem_z = scale(v2x_egaldem))
# Recode qc19 from 1 and 2 to 0 and 1 for the binary model
df_reduced$qc19 <- ifelse(df_reduced$qc19 == 1, 1, 0)
# Optimize the settings to avoid convergence problems in the glmer() functions
control = glmerControl(optimizer = "bobyqa",
optCtrl = list(maxfun = 100000))
# Recode some of the variables such that higher values translate to a more positive
# attitude towards LGBT rights in general -> simplieies coefficient interpretation
df_reduced <- df_reduced %>%
mutate(trans_friends = ifelse(trans_friends == 1, 2, 1), # trans_friends is sd1_7 (see Reduced dataset.R)
lgb_friends = ifelse(lgb_friends == 1, 2, 1)) # lgb_friends is sd1_4 (see Reduced dataset.R)
# STEP 1: Null Model (Intercept-only model)
null_model <- glmer(qc19 ~ 1 + (1|country_name), family = "binomial", data = df_reduced)
summary(null_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: qc19 ~ 1 + (1 | country_name)
## Data: df_reduced
##
## AIC BIC logLik deviance df.resid
## 28212.6 28228.8 -14104.3 28208.6 24156
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0233 -0.9008 0.5066 0.6531 2.2755
##
## Random effects:
## Groups Name Variance Std.Dev.
## country_name (Intercept) 0.9649 0.9823
## Number of obs: 24158, groups: country_name, 28
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4495 0.1856 2.422 0.0154 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate ICC
performance::icc(null_model)
## # Intraclass Correlation Coefficient
##
## Adjusted ICC: 0.227
## Unadjusted ICC: 0.227
The null model indicates a statistically significant baseline probability of support for transgender individuals obtaining official documents (intercept = 0.45, p = 0.015), with substantial country-level variation accounting for 22.7% of the total variance in this support (ICC = 0.227).
# STEP 2: Add Individual-Level Predictors (Level 1)
# starting with predictors that showed correlation with qc19
model1 <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1|country_name),
family = "binomial",
control = control,
data = df_reduced)
summary(model1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +
## trans_friends + respondent_cooperation_z + (1 | country_name)
## Data: df_reduced
## Control: control
##
## AIC BIC logLik deviance df.resid
## 26411.0 26483.8 -13196.5 26393.0 24149
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.1228 -0.7612 0.3617 0.6576 4.7443
##
## Random effects:
## Groups Name Variance Std.Dev.
## country_name (Intercept) 0.6956 0.834
## Number of obs: 24158, groups: country_name, 28
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.79076 0.17981 -9.959 < 2e-16 ***
## age_z -0.08163 0.01613 -5.060 4.20e-07 ***
## gender 0.38278 0.03068 12.477 < 2e-16 ***
## religion_z 0.10468 0.01767 5.924 3.14e-09 ***
## political_ideology_z -0.13256 0.01552 -8.543 < 2e-16 ***
## lgb_friends 0.88187 0.03648 24.173 < 2e-16 ***
## trans_friends 0.40647 0.06001 6.773 1.26e-11 ***
## respondent_cooperation_z -0.27127 0.01610 -16.845 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age_z gender rlgn_z pltc__ lgb_fr trns_f
## age_z -0.070
## gender -0.246 -0.022
## religion_z 0.014 0.119 0.098
## pltcl_dlgy_ -0.021 0.045 0.007 0.070
## lgb_friends -0.187 0.211 -0.043 -0.087 0.032
## trans_frnds -0.297 0.036 -0.005 -0.025 0.015 -0.224
## rspndnt_cp_ -0.021 -0.078 -0.023 -0.004 -0.005 0.068 0.017
# compare with the null_model
anova(null_model, model1)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## null_model 2 28213 28229 -14104 28209
## model1 9 26411 26484 -13196 26393 1815.7 7 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelsummary(
list("Model 1" = null_model,
"Model 2" = model1),
stars = TRUE,
gof_map = c("nobs", "aic", "bic", "logLik", "r.squared"))
| Model 1 | Model 2 | |
|---|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | ||
| (Intercept) | 0.449* | -1.791*** |
| (0.186) | (0.180) | |
| age_z | -0.082*** | |
| (0.016) | ||
| gender | 0.383*** | |
| (0.031) | ||
| religion_z | 0.105*** | |
| (0.018) | ||
| political_ideology_z | -0.133*** | |
| (0.016) | ||
| lgb_friends | 0.882*** | |
| (0.036) | ||
| trans_friends | 0.406*** | |
| (0.060) | ||
| respondent_cooperation_z | -0.271*** | |
| (0.016) | ||
| SD (Intercept country_name) | 0.982 | 0.834 |
| Num.Obs. | 24158 | 24158 |
| AIC | 28212.6 | 26411.0 |
| BIC | 28228.8 | 26483.8 |
Model 1 significantly improves fit over the null model (χ² = 1720.5, p < 0.001, lower AIC/BIC). Transgender documentation is positively associated with characteristics such as being female and having LGB/transgender friends, while negatively associated with age and respondent cooperation during the survey.
# STEP 3: Add Country-Level Predictors (Level 2)
# We test three different models based on 'themes'. As we only have 28 level-2 units,
# we should not add more than 2-3 country-level variables to each model --> overfitting
# Political/ Economic development model
model_econ_pol <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +
trans_friends + respondent_cooperation_z +
scale(gdp_2018) + z_functioning_of_government +
(1 + lgb_friends|country_name),
family = "binomial", data = df_reduced)
# Social wellbeing model
model_wellbeing <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +
trans_friends + respondent_cooperation_z +
scale(Happiness_Score) + scale(gender_equality_index) +
(1 + lgb_friends|country_name),
family = "binomial", data = df_reduced)
# LGBTQ/equality values model
model_equality <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z +
rainbow_score_z + v2x_egaldem_z + (1|country_name),
family = "binomial",
control = control,
data = df_reduced)
# decide which model is the best
anova(model_econ_pol, model_wellbeing, model_equality) # equality is the best
## Data: df_reduced
## Models:
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model_econ_pol: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + scale(gdp_2018) + z_functioning_of_government + (1 + lgb_friends | country_name)
## model_wellbeing: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + scale(Happiness_Score) + scale(gender_equality_index) + (1 + lgb_friends | country_name)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## model_equality 11 26382 26471 -13180 26360
## model_econ_pol 13 26401 26506 -13188 26375 0 2 1
## model_wellbeing 13 26404 26509 -13189 26378 0 0
# check whether the equality model can be improved
model_equality_2 <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z +
rainbow_score_z + v2x_egaldem_z + z_functioning_of_government +
(1|country_name),
family = "binomial",
control = control,
data = df_reduced)
anova(model_equality, model_equality_2) # no improvement --> keep model_equality
## Data: df_reduced
## Models:
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model_equality_2: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + z_functioning_of_government + (1 | country_name)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## model_equality 11 26382 26471 -13180 26360
## model_equality_2 12 26383 26480 -13180 26359 1.0893 1 0.2966
# test for multicollinearity among the two cultural country-level variables
vif_model <- lm(qc19 ~ rainbow_score_z + v2x_egaldem_z, data = df_reduced)
vif(vif_model) # no multicollinearity issues --> keep model_equality as it is
## rainbow_score_z v2x_egaldem_z
## 1.179562 1.179562
# compare with the previous two models
anova(null_model, model1, model_equality)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## null_model 2 28213 28229 -14104 28209
## model1 9 26411 26484 -13196 26393 1815.669 7 < 2.2e-16 ***
## model_equality 11 26382 26471 -13180 26360 32.612 2 8.285e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelsummary(
list("Model 1" = null_model,
"Model 2" = model1,
"Model 3" = model_equality),
stars = TRUE,
gof_map = c("nobs", "aic", "bic", "logLik", "r.squared"))
| Model 1 | Model 2 | Model 3 | |
|---|---|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |||
| (Intercept) | 0.449* | -1.791*** | -1.763*** |
| (0.186) | (0.180) | (0.122) | |
| age_z | -0.082*** | -0.083*** | |
| (0.016) | (0.016) | ||
| gender | 0.383*** | 0.384*** | |
| (0.031) | (0.031) | ||
| religion_z | 0.105*** | 0.102*** | |
| (0.018) | (0.018) | ||
| political_ideology_z | -0.133*** | -0.132*** | |
| (0.016) | (0.016) | ||
| lgb_friends | 0.882*** | 0.876*** | |
| (0.036) | (0.036) | ||
| trans_friends | 0.406*** | 0.407*** | |
| (0.060) | (0.060) | ||
| respondent_cooperation_z | -0.271*** | -0.269*** | |
| (0.016) | (0.016) | ||
| rainbow_score_z | 0.354*** | ||
| (0.091) | |||
| v2x_egaldem_z | 0.480*** | ||
| (0.095) | |||
| SD (Intercept country_name) | 0.982 | 0.834 | 0.458 |
| Num.Obs. | 24158 | 24158 | 24158 |
| AIC | 28212.6 | 26411.0 | 26382.4 |
| BIC | 28228.8 | 26483.8 | 26471.4 |
Model 2 improves fit yet again by adding country-level predictors (χ² = 33.7, p < 0.001, AIC reduced by 29.7 points). The strongest predictors are LGB friends (0.88), egalitarian democracy (0.49), transgender friends (0.42), and LGBTQ legal protections (rainbow score, 0.37), while individual-level predictors remained consistent with Model 1.
# STEP 4: Add Random Slopes for Individual-Level Predictors
# Let's add random slopes for lgb_friends which has shown a strong effect
model3 <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z +
rainbow_score_z + v2x_egaldem_z +
(1 + lgb_friends|country_name),
family = "binomial",
data = df_reduced)
summary(model3)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +
## trans_friends + respondent_cooperation_z + rainbow_score_z +
## v2x_egaldem_z + (1 + lgb_friends | country_name)
## Data: df_reduced
##
## AIC BIC logLik deviance df.resid
## 26379.2 26484.4 -13176.6 26353.2 24145
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.8970 -0.7605 0.3654 0.6560 4.7324
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## country_name (Intercept) 0.28167 0.5307
## lgb_friends 0.02986 0.1728 -0.53
## Number of obs: 24158, groups: country_name, 28
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.75034 0.13324 -13.136 < 2e-16 ***
## age_z -0.08369 0.01617 -5.177 2.25e-07 ***
## gender 0.38596 0.03072 12.565 < 2e-16 ***
## religion_z 0.10190 0.01769 5.761 8.36e-09 ***
## political_ideology_z -0.13125 0.01556 -8.437 < 2e-16 ***
## lgb_friends 0.87288 0.05021 17.385 < 2e-16 ***
## trans_friends 0.40446 0.06011 6.728 1.72e-11 ***
## respondent_cooperation_z -0.26967 0.01612 -16.734 < 2e-16 ***
## rainbow_score_z 0.36167 0.09015 4.012 6.02e-05 ***
## v2x_egaldem_z 0.47996 0.09428 5.091 3.57e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age_z gender rlgn_z pltc__ lgb_fr trns_f rspn__ rnbw__
## age_z -0.098
## gender -0.332 -0.022
## religion_z 0.018 0.120 0.098
## pltcl_dlgy_ -0.031 0.046 0.007 0.070
## lgb_friends -0.466 0.158 -0.029 -0.059 0.027
## trans_frnds -0.400 0.036 -0.005 -0.025 0.015 -0.166
## rspndnt_cp_ -0.028 -0.078 -0.024 -0.003 -0.004 0.048 0.018
## ranbw_scr_z 0.030 -0.032 0.013 -0.003 0.014 -0.029 -0.008 0.001
## v2x_egldm_z 0.006 -0.012 0.012 -0.020 0.011 -0.008 0.012 0.011 -0.346
# compare with previous three models
anova(null_model, model1, model_equality, model3)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model3: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 + lgb_friends | country_name)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## null_model 2 28213 28229 -14104 28209
## model1 9 26411 26484 -13196 26393 1815.6686 7 < 2.2e-16 ***
## model_equality 11 26382 26471 -13180 26360 32.6125 2 8.285e-08 ***
## model3 13 26379 26484 -13177 26353 7.1246 2 0.02837 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelsummary(
list("Model 1" = null_model,
"Model 2" = model1,
"Model 3" = model_equality,
"Model 4" = model3),
stars = TRUE,
gof_map = c("nobs", "aic", "bic", "logLik", "r.squared"))
| Model 1 | Model 2 | Model 3 | Model 4 | |
|---|---|---|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | ||||
| (Intercept) | 0.449* | -1.791*** | -1.763*** | -1.750*** |
| (0.186) | (0.180) | (0.122) | (0.133) | |
| age_z | -0.082*** | -0.083*** | -0.084*** | |
| (0.016) | (0.016) | (0.016) | ||
| gender | 0.383*** | 0.384*** | 0.386*** | |
| (0.031) | (0.031) | (0.031) | ||
| religion_z | 0.105*** | 0.102*** | 0.102*** | |
| (0.018) | (0.018) | (0.018) | ||
| political_ideology_z | -0.133*** | -0.132*** | -0.131*** | |
| (0.016) | (0.016) | (0.016) | ||
| lgb_friends | 0.882*** | 0.876*** | 0.873*** | |
| (0.036) | (0.036) | (0.050) | ||
| trans_friends | 0.406*** | 0.407*** | 0.404*** | |
| (0.060) | (0.060) | (0.060) | ||
| respondent_cooperation_z | -0.271*** | -0.269*** | -0.270*** | |
| (0.016) | (0.016) | (0.016) | ||
| rainbow_score_z | 0.354*** | 0.362*** | ||
| (0.091) | (0.090) | |||
| v2x_egaldem_z | 0.480*** | 0.480*** | ||
| (0.095) | (0.094) | |||
| SD (Intercept country_name) | 0.982 | 0.834 | 0.458 | 0.531 |
| SD (lgb_friends country_name) | 0.173 | |||
| Cor (Intercept~lgb_friends country_name) | -0.532 | |||
| Num.Obs. | 24158 | 24158 | 24158 | 24158 |
| AIC | 28212.6 | 26411.0 | 26382.4 | 26379.2 |
| BIC | 28228.8 | 26483.8 | 26471.4 | 26484.4 |
Model 3 slightly improves fit by adding random slopes for LGB friends (χ² = 7.92, p = 0.019). There is modest cross-country variation in the effect of having LGB friends (SD = 0.176), with a negative correlation (-0.48) between intercepts and slopes indicating stronger effects in countries with lower baseline support.
# STEP 5: Add Cross-Level Interactions
# Assuming the random slope for lgb_friends is significant
# We'll test cross-level interactions with both country-level predictors
model4a <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z +
rainbow_score_z + v2x_egaldem_z +
lgb_friends:rainbow_score_z +
(1 + lgb_friends|country_name),
family = "binomial", data = df_reduced)
summary(model4a)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +
## trans_friends + respondent_cooperation_z + rainbow_score_z +
## v2x_egaldem_z + lgb_friends:rainbow_score_z + (1 + lgb_friends |
## country_name)
## Data: df_reduced
##
## AIC BIC logLik deviance df.resid
## 26381.2 26494.5 -13176.6 26353.2 24144
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.8950 -0.7604 0.3654 0.6560 4.7324
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## country_name (Intercept) 0.28176 0.5308
## lgb_friends 0.02985 0.1728 -0.53
## Number of obs: 24158, groups: country_name, 28
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7502822 0.1332809 -13.132 < 2e-16 ***
## age_z -0.0836921 0.0161660 -5.177 2.25e-07 ***
## gender 0.3859664 0.0307213 12.563 < 2e-16 ***
## religion_z 0.1019082 0.0176897 5.761 8.37e-09 ***
## political_ideology_z -0.1312573 0.0155631 -8.434 < 2e-16 ***
## lgb_friends 0.8728531 0.0502206 17.380 < 2e-16 ***
## trans_friends 0.4044711 0.0601146 6.728 1.72e-11 ***
## respondent_cooperation_z -0.2696768 0.0161184 -16.731 < 2e-16 ***
## rainbow_score_z 0.3625479 0.1144465 3.168 0.00154 **
## v2x_egaldem_z 0.4799307 0.0943035 5.089 3.60e-07 ***
## lgb_friends:rainbow_score_z -0.0005866 0.0481628 -0.012 0.99028
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age_z gender rlgn_z pltc__ lgb_fr trns_f rspn__ rnbw__
## age_z -0.098
## gender -0.331 -0.022
## religion_z 0.018 0.120 0.098
## pltcl_dlgy_ -0.031 0.046 0.007 0.070
## lgb_friends -0.466 0.159 -0.030 -0.059 0.028
## trans_frnds -0.400 0.036 -0.005 -0.025 0.015 -0.167
## rspndnt_cp_ -0.029 -0.078 -0.024 -0.003 -0.003 0.049 0.017
## ranbw_scr_z 0.038 -0.026 0.020 0.006 -0.007 -0.038 -0.002 -0.012
## v2x_egldm_z 0.006 -0.012 0.012 -0.020 0.012 -0.007 0.012 0.011 -0.286
## lgb_frnd:__ -0.024 0.002 -0.016 -0.013 0.029 0.025 -0.008 0.020 -0.616
## v2x_g_
## age_z
## gender
## religion_z
## pltcl_dlgy_
## lgb_friends
## trans_frnds
## rspndnt_cp_
## ranbw_scr_z
## v2x_egldm_z
## lgb_frnd:__ 0.022
# Alternative interaction with v2x_egaldem
model4b <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z +
rainbow_score_z+ v2x_egaldem_z +
lgb_friends:v2x_egaldem_z +
(1 + lgb_friends|country_name),
family = "binomial",
data = df_reduced)
summary(model4b)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +
## trans_friends + respondent_cooperation_z + rainbow_score_z +
## v2x_egaldem_z + lgb_friends:v2x_egaldem_z + (1 + lgb_friends |
## country_name)
## Data: df_reduced
##
## AIC BIC logLik deviance df.resid
## 26380.7 26494.0 -13176.3 26352.7 24144
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.9289 -0.7612 0.3665 0.6548 4.7850
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## country_name (Intercept) 0.28103 0.5301
## lgb_friends 0.02859 0.1691 -0.53
## Number of obs: 24158, groups: country_name, 28
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.74930 0.13319 -13.134 < 2e-16 ***
## age_z -0.08385 0.01617 -5.187 2.14e-07 ***
## gender 0.38602 0.03072 12.566 < 2e-16 ***
## religion_z 0.10193 0.01769 5.763 8.24e-09 ***
## political_ideology_z -0.13156 0.01556 -8.453 < 2e-16 ***
## lgb_friends 0.87464 0.04979 17.568 < 2e-16 ***
## trans_friends 0.40301 0.06009 6.706 1.99e-11 ***
## respondent_cooperation_z -0.26977 0.01612 -16.738 < 2e-16 ***
## rainbow_score_z 0.36184 0.09011 4.016 5.93e-05 ***
## v2x_egaldem_z 0.53462 0.11836 4.517 6.27e-06 ***
## lgb_friends:v2x_egaldem_z -0.03826 0.05046 -0.758 0.448
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age_z gender rlgn_z pltc__ lgb_fr trns_f rspn__ rnbw__
## age_z -0.098
## gender -0.332 -0.022
## religion_z 0.018 0.120 0.098
## pltcl_dlgy_ -0.031 0.047 0.007 0.070
## lgb_friends -0.464 0.159 -0.030 -0.059 0.026
## trans_frnds -0.400 0.037 -0.005 -0.025 0.016 -0.169
## rspndnt_cp_ -0.028 -0.077 -0.024 -0.003 -0.004 0.048 0.018
## ranbw_scr_z 0.030 -0.032 0.012 -0.003 0.014 -0.029 -0.008 0.001
## v2x_egldm_z 0.019 -0.022 0.012 -0.016 -0.010 0.008 -0.010 0.004 -0.269
## lgb_frn:2__ -0.012 0.013 -0.004 -0.002 0.026 -0.046 0.033 0.009 -0.005
## v2x_g_
## age_z
## gender
## religion_z
## pltcl_dlgy_
## lgb_friends
## trans_frnds
## rspndnt_cp_
## ranbw_scr_z
## v2x_egldm_z
## lgb_frn:2__ -0.616
# compare with previous models
anova(null_model, model1, model_equality, model3, model4a, model4b)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model3: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 + lgb_friends | country_name)
## model4a: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + lgb_friends:rainbow_score_z + (1 + lgb_friends | country_name)
## model4b: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + lgb_friends:v2x_egaldem_z + (1 + lgb_friends | country_name)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## null_model 2 28213 28229 -14104 28209
## model1 9 26411 26484 -13196 26393 1815.6686 7 < 2.2e-16 ***
## model_equality 11 26382 26471 -13180 26360 32.6125 2 8.285e-08 ***
## model3 13 26379 26484 -13177 26353 7.1246 2 0.02837 *
## model4a 14 26381 26494 -13177 26353 0.0001 1 0.99024
## model4b 14 26381 26494 -13176 26353 0.5685 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelsummary(
list("Model 1" = null_model,
"Model 2" = model1,
"Model 3" = model_equality,
"Model 4" = model3,
"Model 5a" = model4a,
"Model 5b" = model4b),
stars = TRUE,
gof_map = c("nobs", "aic", "bic", "logLik", "r.squared"))
| Model 1 | Model 2 | Model 3 | Model 4 | Model 5a | Model 5b | |
|---|---|---|---|---|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | ||||||
| (Intercept) | 0.449* | -1.791*** | -1.763*** | -1.750*** | -1.750*** | -1.749*** |
| (0.186) | (0.180) | (0.122) | (0.133) | (0.133) | (0.133) | |
| age_z | -0.082*** | -0.083*** | -0.084*** | -0.084*** | -0.084*** | |
| (0.016) | (0.016) | (0.016) | (0.016) | (0.016) | ||
| gender | 0.383*** | 0.384*** | 0.386*** | 0.386*** | 0.386*** | |
| (0.031) | (0.031) | (0.031) | (0.031) | (0.031) | ||
| religion_z | 0.105*** | 0.102*** | 0.102*** | 0.102*** | 0.102*** | |
| (0.018) | (0.018) | (0.018) | (0.018) | (0.018) | ||
| political_ideology_z | -0.133*** | -0.132*** | -0.131*** | -0.131*** | -0.132*** | |
| (0.016) | (0.016) | (0.016) | (0.016) | (0.016) | ||
| lgb_friends | 0.882*** | 0.876*** | 0.873*** | 0.873*** | 0.875*** | |
| (0.036) | (0.036) | (0.050) | (0.050) | (0.050) | ||
| trans_friends | 0.406*** | 0.407*** | 0.404*** | 0.404*** | 0.403*** | |
| (0.060) | (0.060) | (0.060) | (0.060) | (0.060) | ||
| respondent_cooperation_z | -0.271*** | -0.269*** | -0.270*** | -0.270*** | -0.270*** | |
| (0.016) | (0.016) | (0.016) | (0.016) | (0.016) | ||
| rainbow_score_z | 0.354*** | 0.362*** | 0.363** | 0.362*** | ||
| (0.091) | (0.090) | (0.114) | (0.090) | |||
| v2x_egaldem_z | 0.480*** | 0.480*** | 0.480*** | 0.535*** | ||
| (0.095) | (0.094) | (0.094) | (0.118) | |||
| lgb_friends × rainbow_score_z | -0.001 | |||||
| (0.048) | ||||||
| lgb_friends × v2x_egaldem_z | -0.038 | |||||
| (0.050) | ||||||
| SD (Intercept country_name) | 0.982 | 0.834 | 0.458 | 0.531 | 0.531 | 0.530 |
| SD (lgb_friends country_name) | 0.173 | 0.173 | 0.169 | |||
| Cor (Intercept~lgb_friends country_name) | -0.532 | -0.532 | -0.532 | |||
| Num.Obs. | 24158 | 24158 | 24158 | 24158 | 24158 | 24158 |
| AIC | 28212.6 | 26411.0 | 26382.4 | 26379.2 | 26381.2 | 26380.7 |
| BIC | 28228.8 | 26483.8 | 26471.4 | 26484.4 | 26494.5 | 26494.0 |
Testing cross-level interactions shows that neither the interaction between LGB friends and Rainbow score (p = 0.90) nor between LGB friends and egalitarian democracy (p = 0.41) is significant. Adding these interaction terms actually worsens model fit (AIC increases).
# compare all models
anova(null_model, model1, model_equality, model3, model4a, model4b)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model3: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 + lgb_friends | country_name)
## model4a: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + lgb_friends:rainbow_score_z + (1 + lgb_friends | country_name)
## model4b: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + lgb_friends:v2x_egaldem_z + (1 + lgb_friends | country_name)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## null_model 2 28213 28229 -14104 28209
## model1 9 26411 26484 -13196 26393 1815.6686 7 < 2.2e-16 ***
## model_equality 11 26382 26471 -13180 26360 32.6125 2 8.285e-08 ***
## model3 13 26379 26484 -13177 26353 7.1246 2 0.02837 *
## model4a 14 26381 26494 -13177 26353 0.0001 1 0.99024
## model4b 14 26381 26494 -13176 26353 0.5685 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# use Stargazer to display the output in a nice table
model_stats_extended <- function(models, model_names) {
result <- data.frame(
Model = character(),
AIC = numeric(),
BIC = numeric(),
LogLik = numeric(),
Deviance = numeric(),
ChiSq = numeric(),
Df = numeric(),
p = numeric(),
ICC = numeric(),
stringsAsFactors = FALSE
)
# Extract basic statistics
for (i in 1:length(models)) {
model <- models[[i]]
result[i, "Model"] <- model_names[i]
result[i, "AIC"] <- AIC(model)
result[i, "BIC"] <- BIC(model)
result[i, "LogLik"] <- as.numeric(logLik(model))
result[i, "Deviance"] <- deviance(model)
result[i, "ICC"] <- performance::icc(model)$ICC_conditional
}
# Extract chi-square statistics from ANOVA comparison
anova_result <- anova(models[[1]], models[[2]], models[[3]], models[[4]], models[[5]], models[[6]])
for (i in 2:length(models)) {
result[i, "ChiSq"] <- anova_result$Chisq[i]
result[i, "Df"] <- anova_result$Df[i]
result[i, "p"] <- anova_result$`Pr(>Chisq)`[i]
}
return(result)
}
# for the labels
model_names <- c("Null model", "Individual predictors", "Country-level predictors",
"Random slopes", "Interaction (Rainbow)", "Interaction (Egalitarian)")
# extract statistics
all_models_stats <- model_stats_extended(
list(null_model, model1, model_equality, model3, model4a, model4b),
model_names)
# split into two separate tables because one would be too wide
main_models_table <- stargazer(null_model, model1, model_equality, model3,
type = "text",
title = "Main Multilevel Models for Transgender Document Change Support",
column.labels = model_names[1:4],
digits = 3,
star.cutoffs = c(0.05, 0.01, 0.001),
intercept.bottom = FALSE,
single.row = FALSE,
model.numbers = FALSE,
notes.append = FALSE,
notes = "* p<0.05; ** p<0.01; *** p<0.001",
add.lines = list(
c("AIC", round(all_models_stats$AIC[1:4], 1)),
c("BIC", round(all_models_stats$BIC[1:4], 1)),
c("Log Likelihood", round(all_models_stats$LogLik[1:4], 1)),
c("Chi-square", c("", round(all_models_stats$ChiSq[2:4], 2))),
c("df", c("", all_models_stats$Df[2:4])),
c("p-value", c("", format.pval(all_models_stats$p[2:4], digits = 3))),
c("ICC", round(all_models_stats$ICC[1:4], 3))))
##
## Main Multilevel Models for Transgender Document Change Support
## =================================================================================================
## Dependent variable:
## ------------------------------------------------------------------------
## qc19
## Null model Individual predictors Country-level predictors Random slopes
## -------------------------------------------------------------------------------------------------
## Constant 0.449* -1.791*** -1.763*** -1.750***
## (0.186) (0.180) (0.122) (0.133)
##
## age_z -0.082*** -0.083*** -0.084***
## (0.016) (0.016) (0.016)
##
## gender 0.383*** 0.384*** 0.386***
## (0.031) (0.031) (0.031)
##
## religion_z 0.105*** 0.102*** 0.102***
## (0.018) (0.018) (0.018)
##
## political_ideology_z -0.133*** -0.132*** -0.131***
## (0.016) (0.016) (0.016)
##
## lgb_friends 0.882*** 0.876*** 0.873***
## (0.036) (0.036) (0.050)
##
## trans_friends 0.406*** 0.407*** 0.404***
## (0.060) (0.060) (0.060)
##
## respondent_cooperation_z -0.271*** -0.269*** -0.270***
## (0.016) (0.016) (0.016)
##
## rainbow_score_z 0.354*** 0.362***
## (0.091) (0.090)
##
## v2x_egaldem_z 0.480*** 0.480***
## (0.095) (0.094)
##
## -------------------------------------------------------------------------------------------------
## AIC 28212.6 26411 26382.4 26379.2
## BIC 28228.8 26483.8 26471.4 26484.4
## Log Likelihood -14104.3 -13196.5 -13180.2 -13176.6
## Chi-square 1815.67 32.61 7.12
## df 7 2 2
## p-value < 2e-16 8.28e-08 0.0284
## ICC 0.227 0.154 0.043 0.043
## Observations 24,158 24,158 24,158 24,158
## Log Likelihood -14,104.320 -13,196.490 -13,180.180 -13,176.620
## Akaike Inf. Crit. 28,212.650 26,410.980 26,382.360 26,379.240
## Bayesian Inf. Crit. 28,228.830 26,483.810 26,471.380 26,484.440
## =================================================================================================
## Note: * p<0.05; ** p<0.01; *** p<0.001
interaction_models_table <- stargazer(model3, model4a, model4b,
type = "text",
title = "Interaction Models for Transgender Document Change Support",
column.labels = model_names[4:6],
digits = 3,
star.cutoffs = c(0.05, 0.01, 0.001),
intercept.bottom = FALSE,
single.row = FALSE,
model.numbers = FALSE,
notes.append = FALSE,
notes = "* p<0.05; ** p<0.01; *** p<0.001",
add.lines = list(
c("AIC", round(all_models_stats$AIC[4:6], 1)),
c("BIC", round(all_models_stats$BIC[4:6], 1)),
c("Log Likelihood", round(all_models_stats$LogLik[4:6], 1)),
c("Chi-square", c("", round(all_models_stats$ChiSq[5:6], 2))),
c("df", c("", all_models_stats$Df[5:6])),
c("p-value", c("", format.pval(all_models_stats$p[5:6], digits = 3))),
c("ICC", round(all_models_stats$ICC[4:6], 3))))
##
## Interaction Models for Transgender Document Change Support
## =========================================================================================
## Dependent variable:
## -------------------------------------------------------------
## qc19
## Random slopes Interaction (Rainbow) Interaction (Egalitarian)
## -----------------------------------------------------------------------------------------
## Constant -1.750*** -1.750*** -1.749***
## (0.133) (0.133) (0.133)
##
## age_z -0.084*** -0.084*** -0.084***
## (0.016) (0.016) (0.016)
##
## gender 0.386*** 0.386*** 0.386***
## (0.031) (0.031) (0.031)
##
## religion_z 0.102*** 0.102*** 0.102***
## (0.018) (0.018) (0.018)
##
## political_ideology_z -0.131*** -0.131*** -0.132***
## (0.016) (0.016) (0.016)
##
## lgb_friends 0.873*** 0.873*** 0.875***
## (0.050) (0.050) (0.050)
##
## trans_friends 0.404*** 0.404*** 0.403***
## (0.060) (0.060) (0.060)
##
## respondent_cooperation_z -0.270*** -0.270*** -0.270***
## (0.016) (0.016) (0.016)
##
## rainbow_score_z 0.362*** 0.363** 0.362***
## (0.090) (0.114) (0.090)
##
## v2x_egaldem_z 0.480*** 0.480*** 0.535***
## (0.094) (0.094) (0.118)
##
## lgb_friends:rainbow_score_z -0.001
## (0.048)
##
## lgb_friends:v2x_egaldem_z -0.038
## (0.050)
##
## -----------------------------------------------------------------------------------------
## AIC 26379.2 26381.2 26380.7
## BIC 26484.4 26494.5 26494
## Log Likelihood -13176.6 -13176.6 -13176.3
## Chi-square 0 0.57
## df 1 0
## p-value 0.99 NA
## ICC 0.043 0.043 0.043
## Observations 24,158 24,158 24,158
## Log Likelihood -13,176.620 -13,176.620 -13,176.330
## Akaike Inf. Crit. 26,379.240 26,381.240 26,380.670
## Bayesian Inf. Crit. 26,484.440 26,494.530 26,493.960
## =========================================================================================
## Note: * p<0.05; ** p<0.01; *** p<0.001
# Get anova results for interaction models
anova_results_interactions <- anova(model3, model4a, model4b)
stargazer(model3, model4a, model4b,
type = "text",
title = "Interaction Models for Transgender Document Change Support",
column.labels = c("Random Slopes", "Rainbow Interaction", "Egalitarian Interaction"),
covariate.labels = c("Age (z-score)", "Gender", "Religion (z-score)",
"Political ideology (z-score)", "Has LGB friends",
"Has transgender friends", "Respondent cooperation (z-score)",
"Rainbow score (z-score)", "Egalitarian democracy (z-score)",
"LGB friends × Rainbow score",
"LGB friends × Egalitarian democracy"),
digits = 3,
star.cutoffs = c(0.05, 0.01, 0.001),
add.lines = list(
c("Chi-square", "",
round(anova_results_interactions$Chisq[2], 2),
round(anova_results_interactions$Chisq[3], 2)),
c("df", "",
anova_results_interactions$Df[2],
anova_results_interactions$Df[3]),
c("p-value", "",
format.pval(anova_results_interactions$`Pr(>Chisq)`[2], digits = 3),
format.pval(anova_results_interactions$`Pr(>Chisq)`[3], digits = 3)),
c("ICC",
round(performance::icc(model3)$ICC_conditional, 3),
round(performance::icc(model4a)$ICC_conditional, 3),
round(performance::icc(model4b)$ICC_conditional, 3))),
notes = "* p<0.05; ** p<0.01; *** p<0.001",
out = "interaction_models.txt")
##
## Interaction Models for Transgender Document Change Support
## =============================================================================================
## Dependent variable:
## ---------------------------------------------------------
## qc19
## Random Slopes Rainbow Interaction Egalitarian Interaction
## (1) (2) (3)
## ---------------------------------------------------------------------------------------------
## Age (z-score) -0.084*** -0.084*** -0.084***
## (0.016) (0.016) (0.016)
##
## Gender 0.386*** 0.386*** 0.386***
## (0.031) (0.031) (0.031)
##
## Religion (z-score) 0.102*** 0.102*** 0.102***
## (0.018) (0.018) (0.018)
##
## Political ideology (z-score) -0.131*** -0.131*** -0.132***
## (0.016) (0.016) (0.016)
##
## Has LGB friends 0.873*** 0.873*** 0.875***
## (0.050) (0.050) (0.050)
##
## Has transgender friends 0.404*** 0.404*** 0.403***
## (0.060) (0.060) (0.060)
##
## Respondent cooperation (z-score) -0.270*** -0.270*** -0.270***
## (0.016) (0.016) (0.016)
##
## Rainbow score (z-score) 0.362*** 0.363** 0.362***
## (0.090) (0.114) (0.090)
##
## Egalitarian democracy (z-score) 0.480*** 0.480*** 0.535***
## (0.094) (0.094) (0.118)
##
## LGB friends × Rainbow score -0.001
## (0.048)
##
## LGB friends × Egalitarian democracy -0.038
## (0.050)
##
## Constant -1.750*** -1.750*** -1.749***
## (0.133) (0.133) (0.133)
##
## ---------------------------------------------------------------------------------------------
## Chi-square 0 0.57
## df 1 0
## p-value 0.99 NA
## ICC 0.043 0.043 0.043
## Observations 24,158 24,158 24,158
## Log Likelihood -13,176.620 -13,176.620 -13,176.330
## Akaike Inf. Crit. 26,379.240 26,381.240 26,380.670
## Bayesian Inf. Crit. 26,484.440 26,494.530 26,493.960
## =============================================================================================
## Note: *p<0.05; **p<0.01; ***p<0.001
## * p<0.05; ** p<0.01; *** p<0.001
# Get anova results for all models
anova_results_all <- anova(null_model, model1, model_equality, model3, model4a, model4b)
stargazer(null_model, model1, model_equality, model3, model4a, model4b,
type = "text",
title = "Multilevel Models for Transgender Document Change Support",
column.labels = c("Null", "Individual", "Country", "Random Slopes",
"Rainbow Int.", "Egalitarian Int."),
covariate.labels = c("Age (z-score)", "Gender", "Religion (z-score)",
"Political ideology (z-score)", "Has LGB friends",
"Has transgender friends", "Respondent cooperation (z-score)",
"Rainbow score (z-score)", "Egalitarian democracy (z-score)",
"LGB friends × Rainbow score",
"LGB friends × Egalitarian democracy"),
digits = 3,
star.cutoffs = c(0.05, 0.01, 0.001),
add.lines = list(
c("Chi-square", "",
round(anova_results_all$Chisq[2], 2),
round(anova_results_all$Chisq[3], 2),
round(anova_results_all$Chisq[4], 2),
round(anova_results_all$Chisq[5], 2),
round(anova_results_all$Chisq[6], 2)),
c("df", "",
anova_results_all$Df[2],
anova_results_all$Df[3],
anova_results_all$Df[4],
anova_results_all$Df[5],
anova_results_all$Df[6]),
c("p-value", "",
format.pval(anova_results_all$`Pr(>Chisq)`[2], digits = 3),
format.pval(anova_results_all$`Pr(>Chisq)`[3], digits = 3),
format.pval(anova_results_all$`Pr(>Chisq)`[4], digits = 3),
format.pval(anova_results_all$`Pr(>Chisq)`[5], digits = 3),
format.pval(anova_results_all$`Pr(>Chisq)`[6], digits = 3)),
c("ICC",
round(performance::icc(null_model)$ICC_conditional, 3),
round(performance::icc(model1)$ICC_conditional, 3),
round(performance::icc(model_equality)$ICC_conditional, 3),
round(performance::icc(model3)$ICC_conditional, 3),
round(performance::icc(model4a)$ICC_conditional, 3),
round(performance::icc(model4b)$ICC_conditional, 3))),
notes = "* p<0.05; ** p<0.01; *** p<0.001",
font.size = "small", # Use smaller font to help fit the table
out = "all_models.txt")
##
## Multilevel Models for Transgender Document Change Support
## ===================================================================================================================
## Dependent variable:
## -------------------------------------------------------------------------------
## qc19
## Null Individual Country Random Slopes Rainbow Int. Egalitarian Int.
## (1) (2) (3) (4) (5) (6)
## -------------------------------------------------------------------------------------------------------------------
## Age (z-score) -0.082*** -0.083*** -0.084*** -0.084*** -0.084***
## (0.016) (0.016) (0.016) (0.016) (0.016)
##
## Gender 0.383*** 0.384*** 0.386*** 0.386*** 0.386***
## (0.031) (0.031) (0.031) (0.031) (0.031)
##
## Religion (z-score) 0.105*** 0.102*** 0.102*** 0.102*** 0.102***
## (0.018) (0.018) (0.018) (0.018) (0.018)
##
## Political ideology (z-score) -0.133*** -0.132*** -0.131*** -0.131*** -0.132***
## (0.016) (0.016) (0.016) (0.016) (0.016)
##
## Has LGB friends 0.882*** 0.876*** 0.873*** 0.873*** 0.875***
## (0.036) (0.036) (0.050) (0.050) (0.050)
##
## Has transgender friends 0.406*** 0.407*** 0.404*** 0.404*** 0.403***
## (0.060) (0.060) (0.060) (0.060) (0.060)
##
## Respondent cooperation (z-score) -0.271*** -0.269*** -0.270*** -0.270*** -0.270***
## (0.016) (0.016) (0.016) (0.016) (0.016)
##
## Rainbow score (z-score) 0.354*** 0.362*** 0.363** 0.362***
## (0.091) (0.090) (0.114) (0.090)
##
## Egalitarian democracy (z-score) 0.480*** 0.480*** 0.480*** 0.535***
## (0.095) (0.094) (0.094) (0.118)
##
## LGB friends × Rainbow score -0.001
## (0.048)
##
## LGB friends × Egalitarian democracy -0.038
## (0.050)
##
## Constant 0.449* -1.791*** -1.763*** -1.750*** -1.750*** -1.749***
## (0.186) (0.180) (0.122) (0.133) (0.133) (0.133)
##
## -------------------------------------------------------------------------------------------------------------------
## Chi-square 1815.67 32.61 7.12 0 0.57
## df 7 2 2 1 0
## p-value <2e-16 8.28e-08 0.0284 0.99 NA
## ICC 0.227 0.154 0.043 0.043 0.043 0.043
## Observations 24,158 24,158 24,158 24,158 24,158 24,158
## Log Likelihood -14,104.320 -13,196.490 -13,180.180 -13,176.620 -13,176.620 -13,176.330
## Akaike Inf. Crit. 28,212.650 26,410.980 26,382.360 26,379.240 26,381.240 26,380.670
## Bayesian Inf. Crit. 28,228.830 26,483.810 26,471.380 26,484.440 26,494.530 26,493.960
## ===================================================================================================================
## Note: *p<0.05; **p<0.01; ***p<0.001
## * p<0.05; ** p<0.01; *** p<0.001
Model 3 (random slopes for LGB friends) is the best model, as it significantly improves fit over simpler models (χ² = 7.92, p = 0.019) and achieves the lowest AIC (26472.5). Subsequent interaction models fail to improve fit (p = 0.90, p = 0.41) and slightly increase AIC, confirming Model 3 as the most robust choice.
# For logistic regression, the level-1 variance is fixed at π²/3
pi_squared_by_3 <- (pi^2)/3 # approximately 3.29
# Get variance components from null model
var_null <- as.data.frame(VarCorr(null_model))
between_var_null <- var_null$vcov[1] # Between-country variance
within_var_null <- pi_squared_by_3 # Fixed residual variance for logistic models
# Get variance components from final model (model3)
var_model3 <- as.data.frame(VarCorr(model3))
between_var_model3 <- var_model3$vcov[1] # Between-country variance
within_var_model3 <- pi_squared_by_3 # Still fixed for the final model
# Calculate proportion of between-country variance explained
between_var_explained <- (between_var_null - between_var_model3) / between_var_null
# For binomial models, we can't directly calculate within-country variance explained
# We can use an approximation based on the R² measure for GLMMs
# Calculate total variance in both models
total_var_null <- between_var_null + within_var_null
total_var_model3 <- between_var_model3 + within_var_model3
# Calculate proportion of total variance explained
total_var_explained <- 1 - (total_var_model3 / total_var_null)
# Alternatively, use MuMIn package for R² calculation
library(MuMIn)
r2_model3 <- r.squaredGLMM(model3)
## Warning: the null model is only correct if all the variables it uses are identical
## to those used in fitting the original model.
# This returns two values: R²m (marginal - fixed effects only) and R²c (conditional - fixed + random effects)
cat("Proportion of between-country variance explained:", round(between_var_explained * 100, 1), "%\n")
## Proportion of between-country variance explained: 70.8 %
cat("Proportion of total variance explained:", round(total_var_explained * 100, 1), "%\n")
## Proportion of total variance explained: 16.1 %
cat("R² marginal (fixed effects only):", round(r2_model3[1], 3), "\n")
## R² marginal (fixed effects only): 0.281
cat("R² conditional (fixed + random):", round(r2_model3[2], 3), "\n")
## R² conditional (fixed + random): 0.241
library(ggplot2)
library(sjPlot)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
# 1. Plot fixed effects with error bars
p1 <- plot_model(model3, sort.est = TRUE, show.values = TRUE, value.offset = 0.3) +
theme_bw() +
labs(title = "Factors Influencing Support for Transgender Document Change",
y = "Log-Odds (95% CI)")
# 2. Create a plot showing country variation
# Extract random effects for each country
re <- ranef(model3)$country_name
re_df <- data.frame(
country = rownames(re),
intercept = re$`(Intercept)`,
lgb_slope = re$lgb_friends
)
# Sort by intercept
re_df <- re_df[order(re_df$intercept), ]
re_df$country <- factor(re_df$country, levels = re_df$country)
# Create the plot
p2 <- ggplot(re_df, aes(x = country, y = intercept)) +
geom_point() +
geom_errorbar(aes(ymin = intercept - 1.96*attr(re, "postVar")[1,1,]^0.5,
ymax = intercept + 1.96*attr(re, "postVar")[1,1,]^0.5),
width = 0.2) +
coord_flip() +
theme_bw() +
labs(title = "Country Differences in Support for Transgender Rights",
subtitle = "Random intercepts with 95% confidence intervals",
x = "",
y = "Random Intercept")
# 3. Create effect plots for key predictors
# For Rainbow Score
p3 <- plot_model(model3, type = "pred", terms = "rainbow_score_z [-2:2]",
title = "Effect of Rainbow Score on Support Level",
axis.title = c("Rainbow Score (z-score)", "Probability of Support")) +
theme_bw()
## You are calculating adjusted predictions on the population-level (i.e.
## `type = "fixed"`) for a *generalized* linear mixed model.
## This may produce biased estimates due to Jensen's inequality. Consider
## setting `bias_correction = TRUE` to correct for this bias.
## See also the documentation of the `bias_correction` argument.
# For Egalitarian Democracy
p4 <- plot_model(model3, type = "pred", terms = "v2x_egaldem_z [-2:2]",
title = "Effect of Egalitarian Democracy on Support Level",
axis.title = c("Egalitarian Democracy (z-score)", "Probability of Support")) +
theme_bw()
## Threshold plot
# (1) for religion
# simple approach to calculate thresholds
thresholds <- data.frame(
country = unique(df_reduced$country_name),
threshold = NA)
# Extract fixed effects
fixed_effects <- fixef(model3)
random_effects <- ranef(model3)$country_name
# Calculate threshold for each country based on model coefficients
for (i in 1:nrow(thresholds)) {
country_i <- thresholds$country[i]
# Get country-specific intercept
intercept <- fixed_effects["(Intercept)"] + random_effects[country_i, "(Intercept)"]
# Get religiosity coefficient
relig_coef <- fixed_effects["religion_z"]
# Calculate other effects (at reference/mean values)
other_effects <- fixed_effects["lgb_friends"] * 2 # Has LGB friends
if ("trans_friends" %in% names(fixed_effects))
other_effects <- other_effects + fixed_effects["trans_friends"] * 1.5
# Calculate religiosity threshold where log-odds = 0 (probability = 0.5)
# Solve: intercept + relig_coef * threshold + other_effects = 0
thresholds$threshold[i] <- -(intercept + other_effects) / relig_coef
}
# lot the thresholds
p5 <- ggplot(thresholds, aes(x = reorder(country, -threshold), y = threshold)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
theme_minimal() +
labs(title = "Religiosity Threshold for Supporting Transgender Rights by Country",
subtitle = "Religiosity z-score at which support probability crosses 50%",
x = "",
y = "Religiosity Threshold (z-score)")
# (2) for political ideology
# Calculate political ideology thresholds
pol_thresholds <- data.frame(
country = unique(df_reduced$country_name),
threshold = NA
)
# Extract fixed effects
fixed_effects <- fixef(model3)
random_effects <- ranef(model3)$country_name
# Calculate threshold for each country based on model coefficients
for (i in 1:nrow(pol_thresholds)) {
country_i <- pol_thresholds$country[i]
# Get country-specific intercept
intercept <- fixed_effects["(Intercept)"] + random_effects[country_i, "(Intercept)"]
# Get political ideology coefficient
pol_coef <- fixed_effects["political_ideology_z"]
# Calculate other effects (at reference/mean values)
other_effects <- fixed_effects["lgb_friends"] * 2 + # Has LGB friends
fixed_effects["trans_friends"] * 1.5 +
fixed_effects["religion_z"] * 0 # Set religion to mean
# Calculate ideology threshold where log-odds = 0 (probability = 0.5)
pol_thresholds$threshold[i] <- -(intercept + other_effects) / pol_coef
}
# Plot the thresholds
p6 <- ggplot(pol_thresholds, aes(x = reorder(country, -threshold), y = threshold)) +
geom_bar(stat = "identity", fill = "darkred") +
coord_flip() +
theme_minimal() +
labs(title = "Political Ideology Threshold for Supporting Transgender Rights by Country",
subtitle = "Political ideology z-score at which support probability crosses 50%",
x = "",
y = "Political Ideology Threshold (z-score)")
# Display plots
p1
p2
p3
p4
p5
p6